def validation_step(self, batch, batch_idx):
    if isinstance(batch, dict):
        batch = rlt.DiscreteDqnInput.from_dict(batch)
    # HACK: Move to cpu in order to hold more batches in memory
    # This is only needed when trainers need in-memory
    # EvaluationDataPages of the full evaluation dataset
    return EvaluationDataPage.create_from_training_batch(batch, self).cpu()
def create_edp(self, environment, samples, epsilon_model):
    """Generate an EvaluationDataPage such that the model policy is
    epsilon-greedy with parameter epsilon_model. The true values of this
    policy are used for the model_values* data.
    """
    tdp = environment.preprocess_samples(
        samples, len(samples.mdp_ids), do_shuffle=False
    )[0]
    # Compute rewards, probs, and values for all actions of each sampled state
    model_rewards = environment.true_rewards_all_actions_for_sample(samples.states)
    model_propensities = environment.policy_probabilities_for_sample(
        samples.states, epsilon_model
    )
    model_values = environment.true_epsilon_values_all_actions_for_sample(
        samples.states, epsilon_model
    )
    # Compute rewards for the logged action
    model_rewards_logged_action = environment.true_rewards_for_sample(
        samples.states, samples.actions
    )
    edp = EvaluationDataPage(
        mdp_id=np.array(samples.mdp_ids).reshape(-1, 1),
        sequence_number=torch.tensor(samples.sequence_numbers, dtype=torch.int),
        logged_propensities=tdp.propensities,
        logged_rewards=tdp.rewards,
        action_mask=tdp.actions,
        model_propensities=torch.tensor(model_propensities, dtype=torch.float32),
        model_rewards=torch.tensor(model_rewards, dtype=torch.float32),
        model_rewards_for_logged_action=torch.tensor(
            model_rewards_logged_action, dtype=torch.float32
        ),
        model_values=torch.tensor(model_values, dtype=torch.float32),
        model_values_for_logged_action=None,
        possible_actions_mask=tdp.possible_actions_mask,
    )
    return edp
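# A minimal usage sketch for create_edp. The environment factory and sampler
# below are hypothetical stand-ins for a test environment that exposes the
# preprocess_samples / *_for_sample helpers referenced above; only create_edp
# itself and its epsilon_model argument come from the snippet.
env = build_test_environment()      # hypothetical factory
samples = draw_logged_samples(env)  # hypothetical: logged transitions with mdp_ids, states, actions
edp = self.create_edp(env, samples, epsilon_model=0.05)
# The resulting page can then feed the usual post-training CPE entry point:
# evaluator.evaluate_post_training(edp)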
def gather_eval_data(self, test_step_outputs):
    eval_data = None
    for batch in test_step_outputs:
        edp = EvaluationDataPage.create_from_training_batch(batch, self)
        if eval_data is None:
            eval_data = edp
        else:
            eval_data = eval_data.append(edp)
    if eval_data.mdp_id is not None:
        eval_data = eval_data.sort()
        eval_data = eval_data.compute_values(self.gamma)
        eval_data.validate()
    return eval_data
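# The append -> sort -> compute_values -> validate sequence above recurs in
# several gather_eval_data variants below. A standalone sketch of the same
# pattern, assuming `pages` is any iterable of EvaluationDataPage and `gamma`
# is the trainer's discount factor (the helper name is illustrative):
def merge_eval_data_pages(pages, gamma):
    eval_data = None
    for edp in pages:
        eval_data = edp if eval_data is None else eval_data.append(edp)
    if eval_data is not None and eval_data.mdp_id is not None:
        eval_data = eval_data.sort()                  # order by mdp_id / sequence_number
        eval_data = eval_data.compute_values(gamma)   # discounted logged values
        eval_data.validate()                          # sanity-check shapes and dtypes
    return eval_data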
def validation_step(self, batch: rlt.PreprocessedRankingInput, batch_idx: int):
    seq2slate_net = self.seq2slate_net
    assert seq2slate_net.training is False
    logged_slate_rank_prob = torch.exp(
        seq2slate_net(batch, mode=Seq2SlateMode.PER_SEQ_LOG_PROB_MODE)
        .log_probs.detach()
        .flatten()
        .cpu()
    )
    ranked_slate_output = seq2slate_net(batch, Seq2SlateMode.RANK_MODE, greedy=True)
    ranked_slate_rank_prob = ranked_slate_output.ranked_per_seq_probs.cpu()
    self.reporter.log(
        logged_slate_rank_probs=logged_slate_rank_prob,
        ranked_slate_rank_probs=ranked_slate_rank_prob,
    )
    if not self.calc_cpe:
        return
    edp_g = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net,
        self.reward_network,
        batch,
        eval_greedy=True,
    )
    edp_ng = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net,
        self.reward_network,
        batch,
        eval_greedy=False,
    )
    return edp_g, edp_ng
def handle(self, tdp: PreprocessedTrainingBatch) -> None:
    if not self.trainer.calc_cpe_in_training:
        return
    # TODO: Perhaps we can make an RLTrainer param to check if continuous?
    if isinstance(self.trainer, (SACTrainer, TD3Trainer)):
        # TODO: Implement CPE for continuous algos
        edp = None
    else:
        edp = EvaluationDataPage.create_from_training_batch(tdp, self.trainer)
    if self.evaluation_data is None:
        self.evaluation_data = edp
    else:
        self.evaluation_data = self.evaluation_data.append(edp)
def evaluate_post_training(self, edp: EvaluationDataPage) -> CpeDetails:
    cpe_details = CpeDetails()
    cpe_details.reward_estimates = self.score_cpe("Reward", edp)
    if (
        self.metrics_to_score is not None
        and edp.logged_metrics is not None
        and self.action_names is not None
    ):
        for i, metric in enumerate(self.metrics_to_score):
            logger.info(
                "--------- Running CPE on metric: {} ---------".format(metric)
            )
            metric_reward_edp = edp.set_metric_as_reward(i, len(self.action_names))
            cpe_details.metric_estimates[metric] = self.score_cpe(
                metric, metric_reward_edp
            )
    if self.action_names is not None:
        if edp.optimal_q_values is not None:
            value_means = edp.optimal_q_values.mean(dim=0)
            cpe_details.q_value_means = {
                action: float(value_means[i])
                for i, action in enumerate(self.action_names)
            }
            value_stds = edp.optimal_q_values.std(dim=0)
            cpe_details.q_value_stds = {
                action: float(value_stds[i])
                for i, action in enumerate(self.action_names)
            }
        if edp.eval_action_idxs is not None:
            cpe_details.action_distribution = {
                # pyre-fixme[6]: Expected `Union[_SupportsIndex, bytearray,
                #  bytes, str, typing.SupportsFloat]` for 1st param but got
                #  `ByteTensor`.
                action: float((edp.eval_action_idxs == i).sum())
                / edp.eval_action_idxs.shape[0]
                for i, action in enumerate(self.action_names)
            }
    # Compute MC Loss on Aggregate Reward
    cpe_details.mc_loss = float(
        F.mse_loss(edp.logged_values, edp.model_values_for_logged_action)
    )
    # pyre-fixme[16]: `Evaluator` has no attribute `notify_observers`.
    self.notify_observers(cpe_details=cpe_details)
    return cpe_details
def gather_eval_data(self, validation_step_outputs):
    was_on_gpu = self.on_gpu
    self.cpu()
    eval_data = None
    for batch in validation_step_outputs:
        edp = EvaluationDataPage.create_from_training_batch(batch, self)
        if eval_data is None:
            eval_data = edp
        else:
            eval_data = eval_data.append(edp)
    if eval_data and eval_data.mdp_id is not None:
        eval_data = eval_data.sort()
        eval_data = eval_data.compute_values(self.gamma)
        eval_data.validate()
    if was_on_gpu:
        self.cuda()
    return eval_data
def evaluate_post_training(self, edp: EvaluationDataPage) -> CpeDetails:
    cpe_details = CpeDetails()
    cpe_details.reward_estimates = self.score_cpe("Reward", edp)
    if (
        self.metrics_to_score is not None
        and edp.logged_metrics is not None
        and self.action_names is not None
    ):
        for i, metric in enumerate(self.metrics_to_score):
            logger.info(
                "--------- Running CPE on metric: {} ---------".format(metric)
            )
            metric_reward_edp = edp.set_metric_as_reward(i, len(self.action_names))
            cpe_details.metric_estimates[metric] = self.score_cpe(
                metric, metric_reward_edp
            )
    if self.action_names is not None:
        if edp.optimal_q_values is not None:
            value_means = edp.optimal_q_values.mean(dim=0)
            cpe_details.q_value_means = {
                action: float(value_means[i])
                for i, action in enumerate(self.action_names)
            }
            # pyre-ignore [16]: `Optional` has no attribute `std`
            value_stds = edp.optimal_q_values.std(dim=0)
            cpe_details.q_value_stds = {
                action: float(value_stds[i])
                for i, action in enumerate(self.action_names)
            }
        if edp.eval_action_idxs is not None:
            cpe_details.action_distribution = {
                # pyre-ignore [16]: `bool` has no attribute `sum`
                action: float((edp.eval_action_idxs == i).sum())
                # pyre-ignore [16]: `Optional` has no attribute `shape`
                / edp.eval_action_idxs.shape[0]
                for i, action in enumerate(self.action_names)
            }
    # pyre-fixme[16]: `Evaluator` has no attribute `notify_observers`.
    self.notify_observers(cpe_details=cpe_details)
    return cpe_details
def rlestimator_input_to_edp(
    input: RLEstimatorInput, num_actions: int
) -> EvaluationDataPage:
    mdp_ids = []
    logged_propensities = []
    logged_rewards = []
    action_mask = []
    model_propensities = []
    model_values = []
    for _, mdps in input.log.items():
        for mdp in mdps:
            mdp_id = len(mdp_ids)
            for t in mdp:
                mdp_ids.append(mdp_id)
                logged_propensities.append(t.action_prob)
                logged_rewards.append(t.reward)
                assert t.action is not None
                action_mask.append(
                    [1 if x == t.action.value else 0 for x in range(num_actions)]
                )
                assert t.last_state is not None
                model_propensities.append(
                    [
                        input.target_policy(t.last_state)[Action(x)]
                        for x in range(num_actions)
                    ]
                )
                assert input.value_function is not None
                model_values.append(
                    [
                        input.value_function(t.last_state, Action(x))
                        for x in range(num_actions)
                    ]
                )
    return EvaluationDataPage(
        mdp_id=torch.tensor(mdp_ids).reshape(len(mdp_ids), 1),
        logged_propensities=torch.tensor(logged_propensities).reshape(
            (len(logged_propensities), 1)
        ),
        logged_rewards=torch.tensor(logged_rewards).reshape((len(logged_rewards), 1)),
        action_mask=torch.tensor(action_mask),
        model_propensities=torch.tensor(model_propensities),
        model_values=torch.tensor(model_values),
        sequence_number=torch.tensor([]),
        model_rewards=torch.tensor([]),
        model_rewards_for_logged_action=torch.tensor([]),
    )
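# A quick sanity sketch of the per-transition layout produced above: every
# tensor shares dim 0 == total number of logged transitions, per-action data
# gets one column per action. The helper name is illustrative.
def check_edp_shapes(edp: EvaluationDataPage, num_actions: int) -> None:
    n = edp.mdp_id.shape[0]
    assert edp.logged_propensities.shape == (n, 1)
    assert edp.logged_rewards.shape == (n, 1)
    assert edp.action_mask.shape == (n, num_actions)
    assert edp.model_propensities.shape == (n, num_actions)
    assert edp.model_values.shape == (n, num_actions)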
def evaluate_post_training(self, edp: EvaluationDataPage) -> CpeDetails:
    cpe_details = CpeDetails()
    cpe_details.reward_estimates = self.score_cpe("Reward", edp)
    if (
        self.metrics_to_score is not None
        and edp.logged_metrics is not None
        and self.action_names is not None
    ):
        for i, metric in enumerate(self.metrics_to_score):
            logger.info(
                "--------- Running CPE on metric: {} ---------".format(metric)
            )
            metric_reward_edp = edp.set_metric_as_reward(i, len(self.action_names))
            cpe_details.metric_estimates[metric] = self.score_cpe(
                metric, metric_reward_edp
            )
    if self.action_names is not None:
        if edp.optimal_q_values is not None:
            value_means = edp.optimal_q_values.mean(dim=0)
            cpe_details.q_value_means = {
                action: float(value_means[i])
                for i, action in enumerate(self.action_names)
            }
            value_stds = edp.optimal_q_values.std(dim=0)  # type: ignore
            cpe_details.q_value_stds = {
                action: float(value_stds[i])
                for i, action in enumerate(self.action_names)
            }
        if edp.eval_action_idxs is not None:
            cpe_details.action_distribution = {
                action: float((edp.eval_action_idxs == i).sum())  # type: ignore
                / edp.eval_action_idxs.shape[0]
                for i, action in enumerate(self.action_names)
            }
    # Compute MC Loss on Aggregate Reward
    cpe_details.mc_loss = float(
        F.mse_loss(edp.logged_values, edp.model_values_for_logged_action)
    )
    self.notify_observers(cpe_details=cpe_details)  # type: ignore
    return cpe_details
def gather_eval_data(
    trainer: RLTrainer,
    eval_dataset: Dataset,
    batch_preprocessor: BatchPreprocessor,
    use_gpu: bool,
    reader_options: ReaderOptions,
) -> EvaluationDataPage:
    """Sorts, computes logged values, and validates the EvaluationDataPage."""
    if isinstance(trainer, (SACTrainer, TD3Trainer)):
        raise NotImplementedError("TODO: Implement CPE for continuous algos")
    assert (
        trainer.calc_cpe_in_training
    ), "this function should only be called when this is true."
    # First read the eval_dataset as EvaluationDataPages
    device = "cuda" if use_gpu else "cpu"
    eval_data = None
    with make_batch_reader(
        # pyre-fixme[16]: `HiveDataSetClass` has no attribute `parquet_url`.
        eval_dataset.parquet_url,
        num_epochs=1,
        # pyre-fixme[16]: `ReaderOptions` has no attribute
        #  `petastorm_reader_pool_type`.
        reader_pool_type=reader_options.petastorm_reader_pool_type,
    ) as reader:
        for batch in reader:
            assert rlt.isinstance_namedtuple(batch)
            tensor_batch = dict_to_tensor(batch._asdict(), device=device)
            tdp: rlt.PreprocessedTrainingBatch = batch_preprocessor(tensor_batch)
            edp = EvaluationDataPage.create_from_training_batch(tdp, trainer)
            if eval_data is None:
                eval_data = edp
            else:
                eval_data = eval_data.append(edp)
    eval_data = eval_data.sort()
    eval_data = eval_data.compute_values(trainer.gamma)
    eval_data.validate()
    return eval_data
def test_seq2slate_eval_data_page(self):
    """
    Create 3 slate ranking logs and evaluate using Direct Method, Inverse
    Propensity Scores, and Doubly Robust.

    The logs are as follows:
    state: [1, 0, 0], [0, 1, 0], [0, 0, 1]
    indices in logged slates: [3, 2], [3, 2], [3, 2]
    model output indices: [2, 3], [3, 2], [2, 3]
    logged reward: 4, 5, 7
    logged propensities: 0.2, 0.5, 0.4
    predicted rewards on logged slates: 2, 4, 6
    predicted rewards on model outputted slates: 1, 4, 5
    predicted propensities: 0.4, 0.3, 0.7

    When eval_greedy=True:

    Direct Method uses the predicted rewards on model outputted slates.
    Thus the result is expected to be (1 + 4 + 5) / 3

    Inverse Propensity Scores would scale the reward by 1.0 / logged
    propensities whenever the model output slate matches with the logged
    slate. Since only the second log matches with the model output, the IPS
    result is expected to be 5 / 0.5 / 3

    Doubly Robust is the sum of the direct method result and the
    propensity-scaled reward difference; the latter is defined as:
    1.0 / logged_propensities * (logged reward - predicted reward on logged
    slate) * Indicator(model slate == logged slate)
    Since only the second logged slate matches with the model outputted
    slate, the DR result is expected to be
    (1 + 4 + 5) / 3 + 1.0 / 0.5 * (5 - 4) / 3

    When eval_greedy=False:

    Only Inverse Propensity Scores would be accurate, because it would be
    too expensive to compute all possible slates' propensities and predicted
    rewards for Direct Method.

    The expected IPS = (0.4 / 0.2 * 4 + 0.3 / 0.5 * 5 + 0.7 / 0.4 * 7) / 3
    """
    batch_size = 3
    state_dim = 3
    src_seq_len = 2
    tgt_seq_len = 2
    candidate_dim = 2

    reward_net = FakeSeq2SlateRewardNetwork()
    seq2slate_net = FakeSeq2SlateTransformerNet()

    src_seq = torch.eye(candidate_dim).repeat(batch_size, 1, 1)
    tgt_out_idx = torch.LongTensor([[3, 2], [3, 2], [3, 2]])
    tgt_out_seq = src_seq[
        torch.arange(batch_size).repeat_interleave(tgt_seq_len),
        tgt_out_idx.flatten() - 2,
    ].reshape(batch_size, tgt_seq_len, candidate_dim)

    ptb = rlt.PreprocessedTrainingBatch(
        training_input=rlt.PreprocessedRankingInput(
            state=rlt.FeatureData(float_features=torch.eye(state_dim)),
            src_seq=rlt.FeatureData(float_features=src_seq),
            tgt_out_seq=rlt.FeatureData(float_features=tgt_out_seq),
            src_src_mask=torch.ones(batch_size, src_seq_len, src_seq_len),
            tgt_out_idx=tgt_out_idx,
            tgt_out_probs=torch.tensor([0.2, 0.5, 0.4]),
            slate_reward=torch.tensor([4.0, 5.0, 7.0]),
        ),
        extras=rlt.ExtraData(
            sequence_number=torch.tensor([0, 0, 0]),
            mdp_id=np.array(["0", "1", "2"]),
        ),
    )
    edp = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net, reward_net, ptb.training_input, eval_greedy=True
    )
    logger.info("---------- Start evaluating eval_greedy=True -----------------")
    doubly_robust_estimator = OPEstimatorAdapter(DoublyRobustEstimator())
    dm_estimator = OPEstimatorAdapter(DMEstimator())
    ips_estimator = OPEstimatorAdapter(IPSEstimator())
    switch_estimator = OPEstimatorAdapter(SwitchEstimator())
    switch_dr_estimator = OPEstimatorAdapter(SwitchDREstimator())

    doubly_robust = doubly_robust_estimator.estimate(edp)
    inverse_propensity = ips_estimator.estimate(edp)
    direct_method = dm_estimator.estimate(edp)

    # Verify that Switch with low exponent is equivalent to IPS
    switch_ips = switch_estimator.estimate(edp, exp_base=1)
    # Verify that Switch with no candidates is equivalent to DM
    switch_dm = switch_estimator.estimate(edp, candidates=0)
    # Verify that SwitchDR with low exponent is equivalent to DR
    switch_dr_dr = switch_dr_estimator.estimate(edp, exp_base=1)
    # Verify that SwitchDR with no candidates is equivalent to DM
    switch_dr_dm = switch_dr_estimator.estimate(edp, candidates=0)

    logger.info(f"{direct_method}, {inverse_propensity}, {doubly_robust}")

    avg_logged_reward = (4 + 5 + 7) / 3
    self.assertAlmostEqual(direct_method.raw, (1 + 4 + 5) / 3, delta=1e-6)
    self.assertAlmostEqual(
        direct_method.normalized, direct_method.raw / avg_logged_reward, delta=1e-6
    )
    self.assertAlmostEqual(inverse_propensity.raw, 5 / 0.5 / 3, delta=1e-6)
    self.assertAlmostEqual(
        inverse_propensity.normalized,
        inverse_propensity.raw / avg_logged_reward,
        delta=1e-6,
    )
    self.assertAlmostEqual(
        doubly_robust.raw, direct_method.raw + 1 / 0.5 * (5 - 4) / 3, delta=1e-6
    )
    self.assertAlmostEqual(
        doubly_robust.normalized, doubly_robust.raw / avg_logged_reward, delta=1e-6
    )
    self.assertAlmostEqual(switch_ips.raw, inverse_propensity.raw, delta=1e-6)
    self.assertAlmostEqual(switch_dm.raw, direct_method.raw, delta=1e-6)
    self.assertAlmostEqual(switch_dr_dr.raw, doubly_robust.raw, delta=1e-6)
    self.assertAlmostEqual(switch_dr_dm.raw, direct_method.raw, delta=1e-6)
    logger.info("---------- Finish evaluating eval_greedy=True -----------------")

    logger.info("---------- Start evaluating eval_greedy=False -----------------")
    edp = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net, reward_net, ptb.training_input, eval_greedy=False
    )
    doubly_robust_estimator = OPEstimatorAdapter(DoublyRobustEstimator())
    dm_estimator = OPEstimatorAdapter(DMEstimator())
    ips_estimator = OPEstimatorAdapter(IPSEstimator())
    doubly_robust = doubly_robust_estimator.estimate(edp)
    inverse_propensity = ips_estimator.estimate(edp)
    direct_method = dm_estimator.estimate(edp)
    self.assertAlmostEqual(
        inverse_propensity.raw,
        (0.4 / 0.2 * 4 + 0.3 / 0.5 * 5 + 0.7 / 0.4 * 7) / 3,
        delta=1e-6,
    )
    self.assertAlmostEqual(
        inverse_propensity.normalized,
        inverse_propensity.raw / avg_logged_reward,
        delta=1e-6,
    )
    logger.info("---------- Finish evaluating eval_greedy=False -----------------")
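# The expected values in the docstring above can be checked by hand. A short
# standalone sketch of the same arithmetic (plain Python, no ReAgent needed):
logged_rewards = [4, 5, 7]
logged_props = [0.2, 0.5, 0.4]
pred_rewards_logged = [2, 4, 6]  # predicted rewards on logged slates
pred_rewards_model = [1, 4, 5]   # predicted rewards on model output slates
matches = [0, 1, 0]              # only the second logged slate matches the model output

dm = sum(pred_rewards_model) / 3  # (1 + 4 + 5) / 3 = 10/3
ips = sum(m * r / p for m, r, p in zip(matches, logged_rewards, logged_props)) / 3  # 5/0.5/3 = 10/3
dr = dm + sum(
    m * (r - pr) / p
    for m, r, pr, p in zip(matches, logged_rewards, pred_rewards_logged, logged_props)
) / 3  # 10/3 + 1/0.5 * (5 - 4)/3 = 4.0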
def evaluate(self, eval_tdp: PreprocessedTrainingBatch) -> None:
    seq2slate_net = self.trainer.seq2slate_net
    seq2slate_net_prev_mode = seq2slate_net.training
    seq2slate_net.eval()

    logged_slate_rank_prob = torch.exp(
        seq2slate_net(
            eval_tdp.training_input, mode=Seq2SlateMode.PER_SEQ_LOG_PROB_MODE
        )
        .log_probs.detach()
        .flatten()
        .cpu()
    )

    eval_baseline_loss = torch.tensor([0.0]).reshape(1)
    if self.trainer.baseline_net:
        baseline_net = self.trainer.baseline_net
        # pyre-fixme[16]: `Optional` has no attribute `training`.
        baseline_net_prev_mode = baseline_net.training
        # pyre-fixme[16]: `Optional` has no attribute `eval`.
        baseline_net.eval()
        # pyre-fixme[29]: `Optional[reagent.models.seq2slate.BaselineNet]` is
        #  not a function.
        b = baseline_net(eval_tdp.training_input).detach()
        eval_baseline_loss = (
            F.mse_loss(b, eval_tdp.training_input.slate_reward).cpu().reshape(1)
        )
        # pyre-fixme[16]: `Optional` has no attribute `train`.
        baseline_net.train(baseline_net_prev_mode)
    else:
        b = torch.zeros_like(eval_tdp.training_input.slate_reward)

    eval_advantage = (
        # pyre-fixme[16]: `Optional` has no attribute `__sub__`.
        (eval_tdp.training_input.slate_reward - b)
        .flatten()
        .cpu()
    )

    ranked_slate_output = seq2slate_net(
        eval_tdp.training_input, Seq2SlateMode.RANK_MODE, greedy=True
    )
    ranked_slate_rank_prob = torch.prod(
        torch.gather(
            ranked_slate_output.ranked_tgt_out_probs,
            2,
            ranked_slate_output.ranked_tgt_out_idx.unsqueeze(-1),
        ).squeeze(),
        -1,
    ).cpu()

    seq2slate_net.train(seq2slate_net_prev_mode)

    if not self.calc_cpe:
        return

    edp_g = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net,
        # pyre-fixme[6]: Expected `Module` for 2nd param but got
        #  `Optional[nn.Module]`.
        self.reward_network,
        eval_tdp.training_input,
        eval_greedy=True,
    )
    if self.eval_data_pages_g is None:
        self.eval_data_pages_g = edp_g
    else:
        # pyre-fixme[16]: `Optional` has no attribute `append`.
        self.eval_data_pages_g = self.eval_data_pages_g.append(edp_g)

    edp_ng = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net,
        # pyre-fixme[6]: Expected `Module` for 2nd param but got
        #  `Optional[nn.Module]`.
        self.reward_network,
        eval_tdp.training_input,
        eval_greedy=False,
    )
    if self.eval_data_pages_ng is None:
        self.eval_data_pages_ng = edp_ng
    else:
        self.eval_data_pages_ng = self.eval_data_pages_ng.append(edp_ng)

    # pyre-fixme[16]: `RankingPolicyGradientEvaluator` has no attribute
    #  `notify_observers`.
    self.notify_observers(
        eval_baseline_loss=eval_baseline_loss,
        eval_advantages=eval_advantage,
        logged_slate_rank_probs=logged_slate_rank_prob,
        ranked_slate_rank_probs=ranked_slate_rank_prob,
    )
def evaluate(self, eval_tdp: PreprocessedTrainingBatch) -> None:
    seq2slate_net = self.trainer.seq2slate_net
    seq2slate_net_prev_mode = seq2slate_net.training
    seq2slate_net.eval()

    logged_slate_log_prob = (
        seq2slate_net(
            eval_tdp.training_input, mode=Seq2SlateMode.PER_SEQ_LOG_PROB_MODE
        )
        .log_probs.detach()
        .flatten()
        .cpu()
        .numpy()
    )

    if self.trainer.baseline_net:
        baseline_net = self.trainer.baseline_net
        # pyre-fixme[16]: `Optional` has no attribute `training`.
        baseline_net_prev_mode = baseline_net.training
        # pyre-fixme[16]: `Optional` has no attribute `eval`.
        baseline_net.eval()
        # pyre-fixme[29]: `Optional[reagent.models.seq2slate.BaselineNet]` is
        #  not a function.
        b = baseline_net(eval_tdp.training_input).detach()
        self.baseline_loss.append(
            F.mse_loss(b, eval_tdp.training_input.slate_reward).item()
        )
        # pyre-fixme[16]: `Optional` has no attribute `train`.
        baseline_net.train(baseline_net_prev_mode)
    else:
        b = torch.zeros_like(eval_tdp.training_input.slate_reward)
        self.baseline_loss.append(0.0)

    advantage = (eval_tdp.training_input.slate_reward - b).flatten().cpu().numpy()
    self.advantages.append(advantage)
    self.logged_slate_log_probs.append(logged_slate_log_prob)

    ranked_slate_output = seq2slate_net(
        eval_tdp.training_input, Seq2SlateMode.RANK_MODE, greedy=True
    )
    ranked_slate_prob = (
        torch.prod(
            torch.gather(
                ranked_slate_output.ranked_tgt_out_probs,
                2,
                ranked_slate_output.ranked_tgt_out_idx.unsqueeze(-1),
            ).squeeze(),
            -1,
        )
        .cpu()
        .numpy()
    )
    self.ranked_slate_probs.append(ranked_slate_prob)

    seq2slate_net.train(seq2slate_net_prev_mode)

    if not self.calc_cpe:
        return

    edp_g = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net,
        # pyre-fixme[6]: Expected `Module` for 2nd param but got
        #  `Optional[nn.Module]`.
        self.reward_network,
        eval_tdp.training_input,
        eval_greedy=True,
    )
    if self.eval_data_pages_g is None:
        self.eval_data_pages_g = edp_g
    else:
        # pyre-fixme[16]: `Optional` has no attribute `append`.
        self.eval_data_pages_g = self.eval_data_pages_g.append(edp_g)

    edp_ng = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net,
        # pyre-fixme[6]: Expected `Module` for 2nd param but got
        #  `Optional[nn.Module]`.
        self.reward_network,
        eval_tdp.training_input,
        eval_greedy=False,
    )
    if self.eval_data_pages_ng is None:
        self.eval_data_pages_ng = edp_ng
    else:
        self.eval_data_pages_ng = self.eval_data_pages_ng.append(edp_ng)
def evaluate(self, eval_tdp: PreprocessedRankingInput) -> None:
    seq2slate_net = self.trainer.seq2slate_net
    seq2slate_net_prev_mode = seq2slate_net.training
    seq2slate_net.eval()

    logged_slate_rank_prob = torch.exp(
        seq2slate_net(eval_tdp, mode=Seq2SlateMode.PER_SEQ_LOG_PROB_MODE)
        .log_probs.detach()
        .flatten()
        .cpu()
    )

    eval_baseline_loss = torch.tensor([0.0]).reshape(1)
    if self.trainer.baseline_net:
        baseline_net = self.trainer.baseline_net
        # pyre-fixme[16]: `Optional` has no attribute `training`.
        baseline_net_prev_mode = baseline_net.training
        # pyre-fixme[16]: `Optional` has no attribute `eval`.
        baseline_net.eval()
        # pyre-fixme[29]: `Optional[reagent.models.seq2slate.BaselineNet]` is
        #  not a function.
        b = baseline_net(eval_tdp).detach()
        eval_baseline_loss = F.mse_loss(b, eval_tdp.slate_reward).cpu().reshape(1)
        # pyre-fixme[16]: `Optional` has no attribute `train`.
        baseline_net.train(baseline_net_prev_mode)
    else:
        b = torch.zeros_like(eval_tdp.slate_reward)

    eval_advantage = (
        # pyre-fixme[58]: `-` is not supported for operand types
        #  `Optional[torch.Tensor]` and `Any`.
        (eval_tdp.slate_reward - b)
        .flatten()
        .cpu()
    )

    ranked_slate_output = seq2slate_net(eval_tdp, Seq2SlateMode.RANK_MODE, greedy=True)
    ranked_slate_rank_prob = ranked_slate_output.ranked_per_seq_probs.cpu()

    seq2slate_net.train(seq2slate_net_prev_mode)

    if not self.calc_cpe:
        return

    edp_g = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net,
        self.reward_network,
        eval_tdp,
        eval_greedy=True,
    )
    if self.eval_data_pages_g is None:
        self.eval_data_pages_g = edp_g
    else:
        # pyre-fixme[16]: `Optional` has no attribute `append`.
        self.eval_data_pages_g = self.eval_data_pages_g.append(edp_g)

    edp_ng = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net,
        self.reward_network,
        eval_tdp,
        eval_greedy=False,
    )
    if self.eval_data_pages_ng is None:
        self.eval_data_pages_ng = edp_ng
    else:
        self.eval_data_pages_ng = self.eval_data_pages_ng.append(edp_ng)

    # pyre-fixme[16]: `RankingPolicyGradientEvaluator` has no attribute
    #  `notify_observers`.
    self.notify_observers(
        eval_baseline_loss=eval_baseline_loss,
        eval_advantages=eval_advantage,
        logged_slate_rank_probs=logged_slate_rank_prob,
        ranked_slate_rank_probs=ranked_slate_rank_prob,
    )
def evaluate(self, eval_tdp: PreprocessedTrainingBatch) -> None:
    seq2slate_net = self.trainer.seq2slate_net
    seq2slate_net_prev_mode = seq2slate_net.training
    seq2slate_net.eval()

    logged_slate_log_prob = (
        seq2slate_net(
            eval_tdp.training_input, mode=Seq2SlateMode.PER_SEQ_LOG_PROB_MODE
        )
        .log_probs.detach()
        .flatten()
        .cpu()
        .numpy()
    )

    if self.trainer.baseline_net:
        baseline_net = self.trainer.baseline_net
        baseline_net_prev_mode = baseline_net.training
        baseline_net.eval()
        b = baseline_net(eval_tdp.training_input).detach()
        self.baseline_loss.append(
            F.mse_loss(b, eval_tdp.training_input.slate_reward).item()
        )
        baseline_net.train(baseline_net_prev_mode)
    else:
        b = torch.zeros_like(eval_tdp.training_input.slate_reward)
        self.baseline_loss.append(0.0)

    advantage = (eval_tdp.training_input.slate_reward - b).flatten().cpu().numpy()
    self.advantages.append(advantage)
    self.logged_slate_log_probs.append(logged_slate_log_prob)

    ranked_slate_output = seq2slate_net(
        eval_tdp.training_input, Seq2SlateMode.RANK_MODE, greedy=True
    )
    ranked_slate_prob = (
        torch.prod(
            torch.gather(
                ranked_slate_output.ranked_tgt_out_probs,
                2,
                ranked_slate_output.ranked_tgt_out_idx.unsqueeze(-1),
            ).squeeze(),
            -1,
        )
        .cpu()
        .numpy()
    )
    self.ranked_slate_probs.append(ranked_slate_prob)

    seq2slate_net.train(seq2slate_net_prev_mode)

    if not self.calc_cpe:
        return

    edp_g = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net,
        self.reward_network,
        eval_tdp.training_input,
        eval_greedy=True,
    )
    if self.eval_data_pages_g is None:
        self.eval_data_pages_g = edp_g
    else:
        self.eval_data_pages_g = self.eval_data_pages_g.append(edp_g)

    edp_ng = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net,
        self.reward_network,
        eval_tdp.training_input,
        eval_greedy=False,
    )
    if self.eval_data_pages_ng is None:
        self.eval_data_pages_ng = edp_ng
    else:
        self.eval_data_pages_ng = self.eval_data_pages_ng.append(edp_ng)
def validation_step(self, batch, batch_idx):
    # HACK: Move to cpu in order to hold more batches in memory
    # This is only needed when trainers need in-memory
    # EvaluationDataPages of the full evaluation dataset
    return EvaluationDataPage.create_from_training_batch(batch, self).cpu()
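# Taken together, the snippets above form the Lightning-style CPE flow: each
# validation_step emits a CPU-resident EvaluationDataPage, the trainer's
# gather_eval_data merges them, and an Evaluator scores the result. A sketch,
# assuming `trainer` and `evaluator` are instances like those shown above and
# `val_batches` is an iterable of preprocessed validation batches:
#
#   outputs = [trainer.validation_step(b, i) for i, b in enumerate(val_batches)]
#   eval_data = trainer.gather_eval_data(outputs)
#   cpe_details = evaluator.evaluate_post_training(eval_data)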