Example #1
 def validation_step(self, batch, batch_idx):
     if isinstance(batch, dict):
         batch = rlt.DiscreteDqnInput.from_dict(batch)
     # HACK: Move to cpu in order to hold more batches in memory
     # This is only needed when trainers need in-memory
     # EvaluationDataPages of the full evaluation dataset
     return EvaluationDataPage.create_from_training_batch(batch, self).cpu()
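The per-batch pages returned here are later concatenated into a single EvaluationDataPage covering the whole evaluation set, which is why they are moved to CPU eagerly. A minimal sketch of that aggregation step, assuming a Lightning-style validation_epoch_end hook and a self.gamma discount attribute (both assumptions, not shown in this example), following the append/sort/compute_values/validate pattern used by the gather_eval_data examples below:

 def validation_epoch_end(self, validation_step_outputs):
     # Hypothetical aggregation hook: concatenate the CPU pages produced by
     # validation_step, then sort, compute logged values, and validate.
     eval_data = None
     for edp in validation_step_outputs:
         eval_data = edp if eval_data is None else eval_data.append(edp)
     if eval_data is not None and eval_data.mdp_id is not None:
         eval_data = eval_data.sort()
         eval_data = eval_data.compute_values(self.gamma)
         eval_data.validate()
     return eval_data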
Example #2
 def create_edp(self, environment, samples, epsilon_model):
     """Generate a EvaluationDataPage such that the model policy is epsilon
     greedy with parameter epsilon_model. The true values of this policy are
     used for the model_values* data.
     """
     tdp = environment.preprocess_samples(samples,
                                          len(samples.mdp_ids),
                                          do_shuffle=False)[0]
     # compute rewards, probs, values for all actions of each sampled state
     model_rewards = environment.true_rewards_all_actions_for_sample(
         samples.states)
     model_propensities = environment.policy_probabilities_for_sample(
         samples.states, epsilon_model)
     model_values = environment.true_epsilon_values_all_actions_for_sample(
         samples.states, epsilon_model)
     # compute rewards for logged action
     model_rewards_logged_action = environment.true_rewards_for_sample(
         samples.states, samples.actions)
     edp = EvaluationDataPage(
         mdp_id=np.array(samples.mdp_ids).reshape(-1, 1),
         sequence_number=torch.tensor(samples.sequence_numbers,
                                      dtype=torch.int),
         logged_propensities=tdp.propensities,
         logged_rewards=tdp.rewards,
         action_mask=tdp.actions,
         model_propensities=torch.tensor(model_propensities,
                                         dtype=torch.float32),
         model_rewards=torch.tensor(model_rewards, dtype=torch.float32),
         model_rewards_for_logged_action=torch.tensor(
             model_rewards_logged_action, dtype=torch.float32),
         model_values=torch.tensor(model_values, dtype=torch.float32),
         model_values_for_logged_action=None,
         possible_actions_mask=tdp.possible_actions_mask,
     )
     return edp
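The model_propensities built above are just the probabilities of an epsilon-greedy policy over the environment's true Q-values. As a standalone illustration (not part of the environment helpers used above, and with hypothetical q_values), the propensities for a batch of states can be computed like this:

 import torch

 def epsilon_greedy_propensities(q_values: torch.Tensor, epsilon: float) -> torch.Tensor:
     # With probability epsilon the policy picks uniformly among all actions,
     # otherwise it picks the greedy (argmax) action.
     num_actions = q_values.shape[-1]
     probs = torch.full_like(q_values, epsilon / num_actions)
     greedy = q_values.argmax(dim=-1, keepdim=True)
     probs.scatter_(-1, greedy, 1.0 - epsilon + epsilon / num_actions)
     return probs

 # e.g. epsilon_greedy_propensities(torch.tensor([[1.0, 2.0, 0.5]]), 0.1)
 # -> tensor([[0.0333, 0.9333, 0.0333]])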
 def gather_eval_data(self, test_step_outputs):
     eval_data = None
     for batch in test_step_outputs:
         edp = EvaluationDataPage.create_from_training_batch(batch, self)
         if eval_data is None:
             eval_data = edp
         else:
             eval_data = eval_data.append(edp)
     if eval_data.mdp_id is not None:
         eval_data = eval_data.sort()
         eval_data = eval_data.compute_values(self.gamma)
         eval_data.validate()
     return eval_data
Example #4
    def validation_step(self, batch: rlt.PreprocessedRankingInput,
                        batch_idx: int):
        seq2slate_net = self.seq2slate_net

        assert seq2slate_net.training is False

        logged_slate_rank_prob = torch.exp(
            seq2slate_net(batch, mode=Seq2SlateMode.PER_SEQ_LOG_PROB_MODE)
            .log_probs.detach()
            .flatten()
            .cpu()
        )

        ranked_slate_output = seq2slate_net(batch,
                                            Seq2SlateMode.RANK_MODE,
                                            greedy=True)
        ranked_slate_rank_prob = ranked_slate_output.ranked_per_seq_probs.cpu()

        self.reporter.log(
            logged_slate_rank_probs=logged_slate_rank_prob,
            ranked_slate_rank_probs=ranked_slate_rank_prob,
        )

        if not self.calc_cpe:
            return

        edp_g = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net,
            self.reward_network,
            batch,
            eval_greedy=True,
        )

        edp_ng = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net,
            self.reward_network,
            batch,
            eval_greedy=False,
        )

        return edp_g, edp_ng
Example #5
 def handle(self, tdp: PreprocessedTrainingBatch) -> None:
     if not self.trainer.calc_cpe_in_training:
         return
     # TODO: Perhaps we can make an RLTrainer param to check if continuous?
     if isinstance(self.trainer, (SACTrainer, TD3Trainer)):
         # TODO: Implement CPE for continuous algos
         edp = None
     else:
         edp = EvaluationDataPage.create_from_training_batch(
             tdp, self.trainer)
     if self.evaluation_data is None:
         self.evaluation_data = edp
     else:
         self.evaluation_data = self.evaluation_data.append(edp)
Example #6
    def evaluate_post_training(self, edp: EvaluationDataPage) -> CpeDetails:
        cpe_details = CpeDetails()

        cpe_details.reward_estimates = self.score_cpe("Reward", edp)

        if (
            self.metrics_to_score is not None
            and edp.logged_metrics is not None
            and self.action_names is not None
        ):
            for i, metric in enumerate(self.metrics_to_score):
                logger.info(
                    "--------- Running CPE on metric: {} ---------".format(metric)
                )

                metric_reward_edp = edp.set_metric_as_reward(i, len(self.action_names))

                cpe_details.metric_estimates[metric] = self.score_cpe(
                    metric, metric_reward_edp
                )

        if self.action_names is not None:
            if edp.optimal_q_values is not None:
                value_means = edp.optimal_q_values.mean(dim=0)
                cpe_details.q_value_means = {
                    action: float(value_means[i])
                    for i, action in enumerate(self.action_names)
                }
                value_stds = edp.optimal_q_values.std(dim=0)
                cpe_details.q_value_stds = {
                    action: float(value_stds[i])
                    for i, action in enumerate(self.action_names)
                }
            if edp.eval_action_idxs is not None:
                cpe_details.action_distribution = {
                    # pyre-fixme[6]: Expected `Union[_SupportsIndex, bytearray,
                    #  bytes, str, typing.SupportsFloat]` for 1st param but got
                    #  `ByteTensor`.
                    action: float((edp.eval_action_idxs == i).sum())
                    / edp.eval_action_idxs.shape[0]
                    for i, action in enumerate(self.action_names)
                }
        # Compute MC Loss on Aggregate Reward
        cpe_details.mc_loss = float(
            F.mse_loss(edp.logged_values, edp.model_values_for_logged_action)
        )
        # pyre-fixme[16]: `Evaluator` has no attribute `notify_observers`.
        self.notify_observers(cpe_details=cpe_details)
        return cpe_details
Example #7
 def gather_eval_data(self, validation_step_outputs):
     was_on_gpu = self.on_gpu
     self.cpu()
     eval_data = None
     for batch in validation_step_outputs:
         edp = EvaluationDataPage.create_from_training_batch(batch, self)
         if eval_data is None:
             eval_data = edp
         else:
             eval_data = eval_data.append(edp)
     if eval_data and eval_data.mdp_id is not None:
         eval_data = eval_data.sort()
         eval_data = eval_data.compute_values(self.gamma)
         eval_data.validate()
     if was_on_gpu:
         self.cuda()
     return eval_data
Example #8
    def evaluate_post_training(self, edp: EvaluationDataPage) -> CpeDetails:
        cpe_details = CpeDetails()

        cpe_details.reward_estimates = self.score_cpe("Reward", edp)

        if (
            self.metrics_to_score is not None
            and edp.logged_metrics is not None
            and self.action_names is not None
        ):
            for i, metric in enumerate(self.metrics_to_score):
                logger.info(
                    "--------- Running CPE on metric: {} ---------".format(metric)
                )

                metric_reward_edp = edp.set_metric_as_reward(i, len(self.action_names))

                cpe_details.metric_estimates[metric] = self.score_cpe(
                    metric, metric_reward_edp
                )

        if self.action_names is not None:
            if edp.optimal_q_values is not None:
                value_means = edp.optimal_q_values.mean(dim=0)
                cpe_details.q_value_means = {
                    action: float(value_means[i])
                    for i, action in enumerate(self.action_names)
                }
                # pyre-ignore [16]: `Optional` has no attribute `std`
                value_stds = edp.optimal_q_values.std(dim=0)
                cpe_details.q_value_stds = {
                    action: float(value_stds[i])
                    for i, action in enumerate(self.action_names)
                }
            if edp.eval_action_idxs is not None:
                cpe_details.action_distribution = {
                    # pyre-ignore [16]: `bool` has no attribute `sum`
                    action: float((edp.eval_action_idxs == i).sum())
                    # pyre-ignore [16]: `Optional` has no attribute `shape`
                    / edp.eval_action_idxs.shape[0]
                    for i, action in enumerate(self.action_names)
                }
        # pyre-fixme[16]: `Evaluator` has no attribute `notify_observers`.
        self.notify_observers(cpe_details=cpe_details)
        return cpe_details
Example #9
def rlestimator_input_to_edp(input: RLEstimatorInput,
                             num_actions: int) -> EvaluationDataPage:
    mdp_ids = []
    logged_propensities = []
    logged_rewards = []
    action_mask = []
    model_propensities = []
    model_values = []

    for _, mdps in input.log.items():
        for mdp in mdps:
            mdp_id = len(mdp_ids)
            for t in mdp:
                mdp_ids.append(mdp_id)
                logged_propensities.append(t.action_prob)
                logged_rewards.append(t.reward)
                assert t.action is not None
                action_mask.append([
                    1 if x == t.action.value else 0 for x in range(num_actions)
                ])
                assert t.last_state is not None
                model_propensities.append([
                    input.target_policy(t.last_state)[Action(x)]
                    for x in range(num_actions)
                ])
                assert input.value_function is not None
                model_values.append([
                    input.value_function(t.last_state, Action(x))
                    for x in range(num_actions)
                ])

    return EvaluationDataPage(
        mdp_id=torch.tensor(mdp_ids).reshape(len(mdp_ids), 1),
        logged_propensities=torch.tensor(logged_propensities).reshape(
            (len(logged_propensities), 1)),
        logged_rewards=torch.tensor(logged_rewards).reshape(
            (len(logged_rewards), 1)),
        action_mask=torch.tensor(action_mask),
        model_propensities=torch.tensor(model_propensities),
        model_values=torch.tensor(model_values),
        sequence_number=torch.tensor([]),
        model_rewards=torch.tensor([]),
        model_rewards_for_logged_action=torch.tensor([]),
    )
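Every timestep of every logged MDP becomes one row of the resulting page, with one column per action in the action-indexed tensors. A small sanity-check sketch of that layout (illustrative only; it simply restates the shapes implied by the constructor call above):

def check_edp_shapes(edp: EvaluationDataPage, num_actions: int) -> None:
    # One row per logged timestep across all MDPs.
    n = edp.mdp_id.shape[0]
    assert edp.mdp_id.shape == (n, 1)
    assert edp.logged_propensities.shape == (n, 1)
    assert edp.logged_rewards.shape == (n, 1)
    assert edp.action_mask.shape == (n, num_actions)         # one-hot logged action
    assert edp.model_propensities.shape == (n, num_actions)  # target-policy probs
    assert edp.model_values.shape == (n, num_actions)        # value per action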
Example #10
    def evaluate_post_training(self, edp: EvaluationDataPage) -> CpeDetails:
        cpe_details = CpeDetails()

        cpe_details.reward_estimates = self.score_cpe("Reward", edp)

        if (self.metrics_to_score is not None
                and edp.logged_metrics is not None
                and self.action_names is not None):
            for i, metric in enumerate(self.metrics_to_score):
                logger.info(
                    "--------- Running CPE on metric: {} ---------".format(
                        metric))

                metric_reward_edp = edp.set_metric_as_reward(
                    i, len(self.action_names))

                cpe_details.metric_estimates[metric] = self.score_cpe(
                    metric, metric_reward_edp)

        if self.action_names is not None:
            if edp.optimal_q_values is not None:
                value_means = edp.optimal_q_values.mean(dim=0)
                cpe_details.q_value_means = {
                    action: float(value_means[i])
                    for i, action in enumerate(self.action_names)
                }
                value_stds = edp.optimal_q_values.std(dim=0)  # type: ignore
                cpe_details.q_value_stds = {
                    action: float(value_stds[i])
                    for i, action in enumerate(self.action_names)
                }
            if edp.eval_action_idxs is not None:
                cpe_details.action_distribution = {
                    action: float(
                        (edp.eval_action_idxs == i).sum())  # type: ignore
                    / edp.eval_action_idxs.shape[0]
                    for i, action in enumerate(self.action_names)
                }
        # Compute MC Loss on Aggregate Reward
        cpe_details.mc_loss = float(
            F.mse_loss(edp.logged_values, edp.model_values_for_logged_action))
        self.notify_observers(cpe_details=cpe_details)  # type: ignore
        return cpe_details
Example #11
def gather_eval_data(
    trainer: RLTrainer,
    eval_dataset: Dataset,
    batch_preprocessor: BatchPreprocessor,
    use_gpu: bool,
    reader_options: ReaderOptions,
) -> EvaluationDataPage:
    """ Sorts, computes logged values and validates the EvaluationDataPage """
    if isinstance(trainer, (SACTrainer, TD3Trainer)):
        raise NotImplementedError("TODO: Implement CPE for continuous algos")
    assert (trainer.calc_cpe_in_training
            ), "this function should only be called when this is true."

    # first read the eval_dataset as EvaluationDataPages
    device = "cuda" if use_gpu else "cpu"
    eval_data = None
    with make_batch_reader(
            # pyre-fixme[16]: `HiveDataSetClass` has no attribute `parquet_url`.
            eval_dataset.parquet_url,
            num_epochs=1,
            # pyre-fixme[16]: `ReaderOptions` has no attribute `petastorm_reader_pool_type`.
            reader_pool_type=reader_options.petastorm_reader_pool_type,
    ) as reader:
        for batch in reader:
            assert rlt.isinstance_namedtuple(batch)
            tensor_batch = dict_to_tensor(batch._asdict(), device=device)
            tdp: rlt.PreprocessedTrainingBatch = batch_preprocessor(
                tensor_batch)
            edp = EvaluationDataPage.create_from_training_batch(tdp, trainer)
            if eval_data is None:
                eval_data = edp
            else:
                eval_data = eval_data.append(edp)

    eval_data = eval_data.sort()
    eval_data = eval_data.compute_values(trainer.gamma)
    eval_data.validate()
    return eval_data
Example #12
    def test_seq2slate_eval_data_page(self):
        """
        Create 3 slate ranking logs and evaluate using Direct Method, Inverse
        Propensity Scores, and Doubly Robust.

        The logs are as follows:
        state: [1, 0, 0], [0, 1, 0], [0, 0, 1]
        indices in logged slates: [3, 2], [3, 2], [3, 2]
        model output indices: [2, 3], [3, 2], [2, 3]
        logged reward: 4, 5, 7
        logged propensities: 0.2, 0.5, 0.4
        predicted rewards on logged slates: 2, 4, 6
        predicted rewards on model outputted slates: 1, 4, 5
        predicted propensities: 0.4, 0.3, 0.7

        When eval_greedy=True:

        Direct Method uses the predicted rewards on model outputted slates.
        Thus the result is expected to be (1 + 4 + 5) / 3

        Inverse Propensity Scores would scale the reward by 1.0 / logged propensities
        whenever the model output slate matches with the logged slate.
        Since only the second log matches with the model output, the IPS result
        is expected to be 5 / 0.5 / 3

        Doubly Robust is the sum of the direct method result and propensity-scaled
        reward difference; the latter is defined as:
        1.0 / logged_propensities * (logged reward - predicted reward on logged slate)
         * Indicator(model slate == logged slate)
        Since only the second logged slate matches with the model outputted slate,
        the DR result is expected to be (1 + 4 + 5) / 3 + 1.0 / 0.5 * (5 - 4) / 3


        When eval_greedy=False:

        Only Inverse Propensity Scores would be accurate, because it would be too
        expensive to compute all possible slates' propensities and predicted rewards
        for the Direct Method.

        The expected IPS = (0.4 / 0.2 * 4 + 0.3 / 0.5 * 5 + 0.7 / 0.4 * 7) / 3
        """
        batch_size = 3
        state_dim = 3
        src_seq_len = 2
        tgt_seq_len = 2
        candidate_dim = 2

        reward_net = FakeSeq2SlateRewardNetwork()
        seq2slate_net = FakeSeq2SlateTransformerNet()

        src_seq = torch.eye(candidate_dim).repeat(batch_size, 1, 1)
        tgt_out_idx = torch.LongTensor([[3, 2], [3, 2], [3, 2]])
        tgt_out_seq = src_seq[
            torch.arange(batch_size).repeat_interleave(tgt_seq_len),
            tgt_out_idx.flatten() - 2,
        ].reshape(batch_size, tgt_seq_len, candidate_dim)

        ptb = rlt.PreprocessedTrainingBatch(
            training_input=rlt.PreprocessedRankingInput(
                state=rlt.FeatureData(float_features=torch.eye(state_dim)),
                src_seq=rlt.FeatureData(float_features=src_seq),
                tgt_out_seq=rlt.FeatureData(float_features=tgt_out_seq),
                src_src_mask=torch.ones(batch_size, src_seq_len, src_seq_len),
                tgt_out_idx=tgt_out_idx,
                tgt_out_probs=torch.tensor([0.2, 0.5, 0.4]),
                slate_reward=torch.tensor([4.0, 5.0, 7.0]),
            ),
            extras=rlt.ExtraData(
                sequence_number=torch.tensor([0, 0, 0]),
                mdp_id=np.array(["0", "1", "2"]),
            ),
        )

        edp = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net, reward_net, ptb.training_input, eval_greedy=True)
        logger.info(
            "---------- Start evaluating eval_greedy=True -----------------")
        doubly_robust_estimator = OPEstimatorAdapter(DoublyRobustEstimator())
        dm_estimator = OPEstimatorAdapter(DMEstimator())
        ips_estimator = OPEstimatorAdapter(IPSEstimator())
        switch_estimator = OPEstimatorAdapter(SwitchEstimator())
        switch_dr_estimator = OPEstimatorAdapter(SwitchDREstimator())

        doubly_robust = doubly_robust_estimator.estimate(edp)
        inverse_propensity = ips_estimator.estimate(edp)
        direct_method = dm_estimator.estimate(edp)

        # Verify that Switch with low exponent is equivalent to IPS
        switch_ips = switch_estimator.estimate(edp, exp_base=1)
        # Verify that Switch with no candidates is equivalent to DM
        switch_dm = switch_estimator.estimate(edp, candidates=0)
        # Verify that SwitchDR with low exponent is equivalent to DR
        switch_dr_dr = switch_dr_estimator.estimate(edp, exp_base=1)
        # Verify that SwitchDR with no candidates is equivalent to DM
        switch_dr_dm = switch_dr_estimator.estimate(edp, candidates=0)

        logger.info(f"{direct_method}, {inverse_propensity}, {doubly_robust}")

        avg_logged_reward = (4 + 5 + 7) / 3
        self.assertAlmostEqual(direct_method.raw, (1 + 4 + 5) / 3, delta=1e-6)
        self.assertAlmostEqual(direct_method.normalized,
                               direct_method.raw / avg_logged_reward,
                               delta=1e-6)
        self.assertAlmostEqual(inverse_propensity.raw, 5 / 0.5 / 3, delta=1e-6)
        self.assertAlmostEqual(
            inverse_propensity.normalized,
            inverse_propensity.raw / avg_logged_reward,
            delta=1e-6,
        )
        self.assertAlmostEqual(doubly_robust.raw,
                               direct_method.raw + 1 / 0.5 * (5 - 4) / 3,
                               delta=1e-6)
        self.assertAlmostEqual(doubly_robust.normalized,
                               doubly_robust.raw / avg_logged_reward,
                               delta=1e-6)
        self.assertAlmostEqual(switch_ips.raw,
                               inverse_propensity.raw,
                               delta=1e-6)
        self.assertAlmostEqual(switch_dm.raw, direct_method.raw, delta=1e-6)
        self.assertAlmostEqual(switch_dr_dr.raw, doubly_robust.raw, delta=1e-6)
        self.assertAlmostEqual(switch_dr_dm.raw, direct_method.raw, delta=1e-6)
        logger.info(
            "---------- Finish evaluating eval_greedy=True -----------------")

        logger.info(
            "---------- Start evaluating eval_greedy=False -----------------")
        edp = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net, reward_net, ptb.training_input, eval_greedy=False)
        doubly_robust_estimator = OPEstimatorAdapter(DoublyRobustEstimator())
        dm_estimator = OPEstimatorAdapter(DMEstimator())
        ips_estimator = OPEstimatorAdapter(IPSEstimator())

        doubly_robust = doubly_robust_estimator.estimate(edp)
        inverse_propensity = ips_estimator.estimate(edp)
        direct_method = dm_estimator.estimate(edp)
        self.assertAlmostEqual(
            inverse_propensity.raw,
            (0.4 / 0.2 * 4 + 0.3 / 0.5 * 5 + 0.7 / 0.4 * 7) / 3,
            delta=1e-6,
        )
        self.assertAlmostEqual(
            inverse_propensity.normalized,
            inverse_propensity.raw / avg_logged_reward,
            delta=1e-6,
        )
        logger.info(
            "---------- Finish evaluating eval_greedy=False -----------------")
Example #13
    def evaluate(self, eval_tdp: PreprocessedTrainingBatch) -> None:
        seq2slate_net = self.trainer.seq2slate_net
        seq2slate_net_prev_mode = seq2slate_net.training
        seq2slate_net.eval()

        logged_slate_rank_prob = torch.exp(
            seq2slate_net(
                eval_tdp.training_input, mode=Seq2SlateMode.PER_SEQ_LOG_PROB_MODE
            )
            .log_probs.detach()
            .flatten()
            .cpu()
        )

        eval_baseline_loss = torch.tensor([0.0]).reshape(1)
        if self.trainer.baseline_net:
            baseline_net = self.trainer.baseline_net
            # pyre-fixme[16]: `Optional` has no attribute `training`.
            baseline_net_prev_mode = baseline_net.training
            # pyre-fixme[16]: `Optional` has no attribute `eval`.
            baseline_net.eval()
            # pyre-fixme[29]: `Optional[reagent.models.seq2slate.BaselineNet]` is
            #  not a function.
            b = baseline_net(eval_tdp.training_input).detach()
            eval_baseline_loss = (F.mse_loss(
                b, eval_tdp.training_input.slate_reward).cpu().reshape(1))
            # pyre-fixme[16]: `Optional` has no attribute `train`.
            baseline_net.train(baseline_net_prev_mode)
        else:
            b = torch.zeros_like(eval_tdp.training_input.slate_reward)

        eval_advantage = (
            # pyre-fixme[16]: `Optional` has no attribute `__sub__`.
            (eval_tdp.training_input.slate_reward - b).flatten().cpu())

        ranked_slate_output = seq2slate_net(eval_tdp.training_input,
                                            Seq2SlateMode.RANK_MODE,
                                            greedy=True)
        ranked_slate_rank_prob = torch.prod(
            torch.gather(
                ranked_slate_output.ranked_tgt_out_probs,
                2,
                ranked_slate_output.ranked_tgt_out_idx.unsqueeze(-1),
            ).squeeze(),
            -1,
        ).cpu()

        seq2slate_net.train(seq2slate_net_prev_mode)

        if not self.calc_cpe:
            return

        edp_g = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net,
            # pyre-fixme[6]: Expected `Module` for 2nd param but got
            #  `Optional[nn.Module]`.
            self.reward_network,
            eval_tdp.training_input,
            eval_greedy=True,
        )
        if self.eval_data_pages_g is None:
            self.eval_data_pages_g = edp_g
        else:
            # pyre-fixme[16]: `Optional` has no attribute `append`.
            self.eval_data_pages_g = self.eval_data_pages_g.append(edp_g)

        edp_ng = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net,
            # pyre-fixme[6]: Expected `Module` for 2nd param but got
            #  `Optional[nn.Module]`.
            self.reward_network,
            eval_tdp.training_input,
            eval_greedy=False,
        )
        if self.eval_data_pages_ng is None:
            self.eval_data_pages_ng = edp_ng
        else:
            self.eval_data_pages_ng = self.eval_data_pages_ng.append(edp_ng)

        # pyre-fixme[16]: `RankingPolicyGradientEvaluator` has no attribute
        #  `notify_observers`.
        self.notify_observers(
            eval_baseline_loss=eval_baseline_loss,
            eval_advantages=eval_advantage,
            logged_slate_rank_probs=logged_slate_rank_prob,
            ranked_slate_rank_probs=ranked_slate_rank_prob,
        )
    def evaluate(self, eval_tdp: PreprocessedTrainingBatch) -> None:
        seq2slate_net = self.trainer.seq2slate_net
        seq2slate_net_prev_mode = seq2slate_net.training
        seq2slate_net.eval()

        logged_slate_log_prob = (
            seq2slate_net(
                eval_tdp.training_input, mode=Seq2SlateMode.PER_SEQ_LOG_PROB_MODE
            )
            .log_probs.detach()
            .flatten()
            .cpu()
            .numpy()
        )

        if self.trainer.baseline_net:
            baseline_net = self.trainer.baseline_net
            # pyre-fixme[16]: `Optional` has no attribute `training`.
            baseline_net_prev_mode = baseline_net.training
            # pyre-fixme[16]: `Optional` has no attribute `eval`.
            baseline_net.eval()
            # pyre-fixme[29]: `Optional[reagent.models.seq2slate.BaselineNet]` is
            #  not a function.
            b = baseline_net(eval_tdp.training_input).detach()
            self.baseline_loss.append(
                F.mse_loss(b, eval_tdp.training_input.slate_reward).item())
            # pyre-fixme[16]: `Optional` has no attribute `train`.
            baseline_net.train(baseline_net_prev_mode)
        else:
            b = torch.zeros_like(eval_tdp.training_input.slate_reward)
            self.baseline_loss.append(0.0)

        advantage = (eval_tdp.training_input.slate_reward -
                     b).flatten().cpu().numpy()
        self.advantages.append(advantage)
        self.logged_slate_log_probs.append(logged_slate_log_prob)

        ranked_slate_output = seq2slate_net(eval_tdp.training_input,
                                            Seq2SlateMode.RANK_MODE,
                                            greedy=True)
        ranked_slate_prob = (torch.prod(
            torch.gather(
                ranked_slate_output.ranked_tgt_out_probs,
                2,
                ranked_slate_output.ranked_tgt_out_idx.unsqueeze(-1),
            ).squeeze(),
            -1,
        ).cpu().numpy())
        self.ranked_slate_probs.append(ranked_slate_prob)

        seq2slate_net.train(seq2slate_net_prev_mode)

        if not self.calc_cpe:
            return

        edp_g = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net,
            # pyre-fixme[6]: Expected `Module` for 2nd param but got
            #  `Optional[nn.Module]`.
            self.reward_network,
            eval_tdp.training_input,
            eval_greedy=True,
        )
        if self.eval_data_pages_g is None:
            self.eval_data_pages_g = edp_g
        else:
            # pyre-fixme[16]: `Optional` has no attribute `append`.
            self.eval_data_pages_g = self.eval_data_pages_g.append(edp_g)

        edp_ng = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net,
            # pyre-fixme[6]: Expected `Module` for 2nd param but got
            #  `Optional[nn.Module]`.
            self.reward_network,
            eval_tdp.training_input,
            eval_greedy=False,
        )
        if self.eval_data_pages_ng is None:
            self.eval_data_pages_ng = edp_ng
        else:
            self.eval_data_pages_ng = self.eval_data_pages_ng.append(edp_ng)
    def evaluate(self, eval_tdp: PreprocessedRankingInput) -> None:
        seq2slate_net = self.trainer.seq2slate_net
        seq2slate_net_prev_mode = seq2slate_net.training
        seq2slate_net.eval()

        logged_slate_rank_prob = torch.exp(
            seq2slate_net(eval_tdp, mode=Seq2SlateMode.PER_SEQ_LOG_PROB_MODE)
            .log_probs.detach()
            .flatten()
            .cpu()
        )

        eval_baseline_loss = torch.tensor([0.0]).reshape(1)
        if self.trainer.baseline_net:
            baseline_net = self.trainer.baseline_net
            # pyre-fixme[16]: `Optional` has no attribute `training`.
            baseline_net_prev_mode = baseline_net.training
            # pyre-fixme[16]: `Optional` has no attribute `eval`.
            baseline_net.eval()
            # pyre-fixme[29]: `Optional[reagent.models.seq2slate.BaselineNet]` is
            #  not a function.
            b = baseline_net(eval_tdp).detach()
            eval_baseline_loss = F.mse_loss(
                b, eval_tdp.slate_reward).cpu().reshape(1)
            # pyre-fixme[16]: `Optional` has no attribute `train`.
            baseline_net.train(baseline_net_prev_mode)
        else:
            b = torch.zeros_like(eval_tdp.slate_reward)

        eval_advantage = (
            # pyre-fixme[58]: `-` is not supported for operand types
            #  `Optional[torch.Tensor]` and `Any`.
            (eval_tdp.slate_reward - b).flatten().cpu())

        ranked_slate_output = seq2slate_net(eval_tdp,
                                            Seq2SlateMode.RANK_MODE,
                                            greedy=True)
        ranked_slate_rank_prob = ranked_slate_output.ranked_per_seq_probs.cpu()

        seq2slate_net.train(seq2slate_net_prev_mode)

        if not self.calc_cpe:
            return

        edp_g = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net,
            self.reward_network,
            eval_tdp,
            eval_greedy=True,
        )
        if self.eval_data_pages_g is None:
            self.eval_data_pages_g = edp_g
        else:
            # pyre-fixme[16]: `Optional` has no attribute `append`.
            self.eval_data_pages_g = self.eval_data_pages_g.append(edp_g)

        edp_ng = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net,
            self.reward_network,
            eval_tdp,
            eval_greedy=False,
        )
        if self.eval_data_pages_ng is None:
            self.eval_data_pages_ng = edp_ng
        else:
            self.eval_data_pages_ng = self.eval_data_pages_ng.append(edp_ng)

        # pyre-fixme[16]: `RankingPolicyGradientEvaluator` has no attribute
        #  `notify_observers`.
        self.notify_observers(
            eval_baseline_loss=eval_baseline_loss,
            eval_advantages=eval_advantage,
            logged_slate_rank_probs=logged_slate_rank_prob,
            ranked_slate_rank_probs=ranked_slate_rank_prob,
        )
Example #16
    def evaluate(self, eval_tdp: PreprocessedTrainingBatch) -> None:
        seq2slate_net = self.trainer.seq2slate_net
        seq2slate_net_prev_mode = seq2slate_net.training
        seq2slate_net.eval()

        logged_slate_log_prob = (
            seq2slate_net(
                eval_tdp.training_input, mode=Seq2SlateMode.PER_SEQ_LOG_PROB_MODE
            )
            .log_probs.detach()
            .flatten()
            .cpu()
            .numpy()
        )

        if self.trainer.baseline_net:
            baseline_net = self.trainer.baseline_net
            baseline_net_prev_mode = baseline_net.training
            baseline_net.eval()
            b = baseline_net(eval_tdp.training_input).detach()
            self.baseline_loss.append(
                F.mse_loss(b, eval_tdp.training_input.slate_reward).item())
            baseline_net.train(baseline_net_prev_mode)
        else:
            b = torch.zeros_like(eval_tdp.training_input.slate_reward)
            self.baseline_loss.append(0.0)

        advantage = (eval_tdp.training_input.slate_reward -
                     b).flatten().cpu().numpy()
        self.advantages.append(advantage)
        self.logged_slate_log_probs.append(logged_slate_log_prob)

        ranked_slate_output = seq2slate_net(eval_tdp.training_input,
                                            Seq2SlateMode.RANK_MODE,
                                            greedy=True)
        ranked_slate_prob = (torch.prod(
            torch.gather(
                ranked_slate_output.ranked_tgt_out_probs,
                2,
                ranked_slate_output.ranked_tgt_out_idx.unsqueeze(-1),
            ).squeeze(),
            -1,
        ).cpu().numpy())
        self.ranked_slate_probs.append(ranked_slate_prob)

        seq2slate_net.train(seq2slate_net_prev_mode)

        if not self.calc_cpe:
            return

        edp_g = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net,
            self.reward_network,
            eval_tdp.training_input,
            eval_greedy=True,
        )
        if self.eval_data_pages_g is None:
            self.eval_data_pages_g = edp_g
        else:
            self.eval_data_pages_g = self.eval_data_pages_g.append(edp_g)

        edp_ng = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net,
            self.reward_network,
            eval_tdp.training_input,
            eval_greedy=False,
        )
        if self.eval_data_pages_ng is None:
            self.eval_data_pages_ng = edp_ng
        else:
            self.eval_data_pages_ng = self.eval_data_pages_ng.append(edp_ng)
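The ranked_slate_prob computation above turns per-step candidate distributions into a single per-sequence probability: gather picks the probability of the chosen candidate at each position, and prod multiplies them along the sequence. A minimal sketch on hypothetical numbers:

import torch

# One slate of length 2 over 4 candidates; the model ranked candidates [2, 3].
ranked_tgt_out_probs = torch.tensor([[[0.1, 0.2, 0.6, 0.1],
                                      [0.3, 0.3, 0.1, 0.3]]])  # (batch, seq, candidates)
ranked_tgt_out_idx = torch.tensor([[2, 3]])                    # (batch, seq)
per_step = torch.gather(ranked_tgt_out_probs, 2,
                        ranked_tgt_out_idx.unsqueeze(-1)).squeeze(-1)  # tensor([[0.6, 0.3]])
slate_prob = torch.prod(per_step, -1)                          # tensor([0.1800])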
Example #17
 def validation_step(self, batch, batch_idx):
     # HACK: Move to cpu in order to hold more batches in memory
     # This is only needed when trainers need in-memory
     # EvaluationDataPages of the full evaluation dataset
     return EvaluationDataPage.create_from_training_batch(batch, self).cpu()