def forward(
    self,
    state: torch.Tensor,
    src_seq: torch.Tensor,
    tgt_out_seq: torch.Tensor,
    src_src_mask: torch.Tensor,
    tgt_out_idx: torch.Tensor,
) -> torch.Tensor:
    return self.model(
        rlt.PreprocessedRankingInput(
            state=rlt.FeatureData(float_features=state),
            src_seq=rlt.FeatureData(float_features=src_seq),
            tgt_out_seq=rlt.FeatureData(float_features=tgt_out_seq),
            src_src_mask=src_src_mask,
            tgt_out_idx=tgt_out_idx,
        )
    ).predicted_reward
def test_seq2slate_eval_data_page(self):
    """
    Create 3 slate ranking logs and evaluate using Direct Method, Inverse
    Propensity Scores, and Doubly Robust.

    The logs are as follows:
    state: [1, 0, 0], [0, 1, 0], [0, 0, 1]
    indices in logged slates: [3, 2], [3, 2], [3, 2]
    model output indices: [2, 3], [3, 2], [2, 3]
    logged reward: 4, 5, 7
    logged propensities: 0.2, 0.5, 0.4
    predicted rewards on logged slates: 2, 4, 6
    predicted rewards on model outputted slates: 1, 4, 5
    predicted propensities: 0.4, 0.3, 0.7

    When eval_greedy=True:

    Direct Method uses the predicted rewards on model outputted slates.
    Thus the result is expected to be (1 + 4 + 5) / 3.

    Inverse Propensity Scores scales the reward by 1.0 / logged propensity
    whenever the model output slate matches the logged slate. Since only the
    second log matches the model output, the IPS result is expected to be
    5 / 0.5 / 3.

    Doubly Robust is the sum of the Direct Method result and the
    propensity-scaled reward difference; the latter is defined as:
    1.0 / logged_propensity * (logged reward - predicted reward on logged slate)
    * Indicator(model slate == logged slate).
    Since only the second logged slate matches the model outputted slate, the
    DR result is expected to be (1 + 4 + 5) / 3 + 1.0 / 0.5 * (5 - 4) / 3.

    When eval_greedy=False:

    Only Inverse Propensity Scores would be accurate, because it would be too
    expensive to compute all possible slates' propensities and predicted
    rewards for Direct Method.

    The expected IPS = (0.4 / 0.2 * 4 + 0.3 / 0.5 * 5 + 0.7 / 0.4 * 7) / 3.
    """
    batch_size = 3
    state_dim = 3
    src_seq_len = 2
    tgt_seq_len = 2
    candidate_dim = 2

    reward_net = FakeSeq2SlateRewardNetwork()
    seq2slate_net = FakeSeq2SlateTransformerNet()

    src_seq = torch.eye(candidate_dim).repeat(batch_size, 1, 1)
    tgt_out_idx = torch.LongTensor([[3, 2], [3, 2], [3, 2]])
    tgt_out_seq = src_seq[
        torch.arange(batch_size).repeat_interleave(tgt_seq_len),
        tgt_out_idx.flatten() - 2,
    ].reshape(batch_size, tgt_seq_len, candidate_dim)

    ptb = rlt.PreprocessedTrainingBatch(
        training_input=rlt.PreprocessedRankingInput(
            state=rlt.FeatureData(float_features=torch.eye(state_dim)),
            src_seq=rlt.FeatureData(float_features=src_seq),
            tgt_out_seq=rlt.FeatureData(float_features=tgt_out_seq),
            src_src_mask=torch.ones(batch_size, src_seq_len, src_seq_len),
            tgt_out_idx=tgt_out_idx,
            tgt_out_probs=torch.tensor([0.2, 0.5, 0.4]),
            slate_reward=torch.tensor([4.0, 5.0, 7.0]),
        ),
        extras=rlt.ExtraData(
            sequence_number=torch.tensor([0, 0, 0]),
            mdp_id=np.array(["0", "1", "2"]),
        ),
    )

    edp = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net, reward_net, ptb.training_input, eval_greedy=True
    )
    logger.info("---------- Start evaluating eval_greedy=True -----------------")
    doubly_robust_estimator = OPEstimatorAdapter(DoublyRobustEstimator())
    dm_estimator = OPEstimatorAdapter(DMEstimator())
    ips_estimator = OPEstimatorAdapter(IPSEstimator())
    switch_estimator = OPEstimatorAdapter(SwitchEstimator())
    switch_dr_estimator = OPEstimatorAdapter(SwitchDREstimator())

    doubly_robust = doubly_robust_estimator.estimate(edp)
    inverse_propensity = ips_estimator.estimate(edp)
    direct_method = dm_estimator.estimate(edp)
    # Verify that Switch with low exponent is equivalent to IPS
    switch_ips = switch_estimator.estimate(edp, exp_base=1)
    # Verify that Switch with no candidates is equivalent to DM
    switch_dm = switch_estimator.estimate(edp, candidates=0)
    # Verify that SwitchDR with low exponent is equivalent to DR
    switch_dr_dr = switch_dr_estimator.estimate(edp, exp_base=1)
    # Verify that SwitchDR with no candidates is equivalent to DM
    switch_dr_dm = switch_dr_estimator.estimate(edp, candidates=0)
    logger.info(f"{direct_method}, {inverse_propensity}, {doubly_robust}")

    avg_logged_reward = (4 + 5 + 7) / 3
    self.assertAlmostEqual(direct_method.raw, (1 + 4 + 5) / 3, delta=1e-6)
    self.assertAlmostEqual(
        direct_method.normalized, direct_method.raw / avg_logged_reward, delta=1e-6
    )
    self.assertAlmostEqual(inverse_propensity.raw, 5 / 0.5 / 3, delta=1e-6)
    self.assertAlmostEqual(
        inverse_propensity.normalized,
        inverse_propensity.raw / avg_logged_reward,
        delta=1e-6,
    )
    self.assertAlmostEqual(
        doubly_robust.raw, direct_method.raw + 1 / 0.5 * (5 - 4) / 3, delta=1e-6
    )
    self.assertAlmostEqual(
        doubly_robust.normalized, doubly_robust.raw / avg_logged_reward, delta=1e-6
    )
    self.assertAlmostEqual(switch_ips.raw, inverse_propensity.raw, delta=1e-6)
    self.assertAlmostEqual(switch_dm.raw, direct_method.raw, delta=1e-6)
    self.assertAlmostEqual(switch_dr_dr.raw, doubly_robust.raw, delta=1e-6)
    self.assertAlmostEqual(switch_dr_dm.raw, direct_method.raw, delta=1e-6)
    logger.info("---------- Finish evaluating eval_greedy=True -----------------")

    logger.info("---------- Start evaluating eval_greedy=False -----------------")
    edp = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net, reward_net, ptb.training_input, eval_greedy=False
    )
    doubly_robust_estimator = OPEstimatorAdapter(DoublyRobustEstimator())
    dm_estimator = OPEstimatorAdapter(DMEstimator())
    ips_estimator = OPEstimatorAdapter(IPSEstimator())

    doubly_robust = doubly_robust_estimator.estimate(edp)
    inverse_propensity = ips_estimator.estimate(edp)
    direct_method = dm_estimator.estimate(edp)
    self.assertAlmostEqual(
        inverse_propensity.raw,
        (0.4 / 0.2 * 4 + 0.3 / 0.5 * 5 + 0.7 / 0.4 * 7) / 3,
        delta=1e-6,
    )
    self.assertAlmostEqual(
        inverse_propensity.normalized,
        inverse_propensity.raw / avg_logged_reward,
        delta=1e-6,
    )
    logger.info("---------- Finish evaluating eval_greedy=False -----------------")
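# A minimal sketch (not part of the original test; the helper name is
# hypothetical) spelling out the estimator arithmetic from the docstring
# above in plain Python. All numbers come from the three logged slates.
def _expected_estimates_sketch():
    logged_rewards = [4.0, 5.0, 7.0]
    logged_propensities = [0.2, 0.5, 0.4]
    predicted_rewards_on_logged = [2.0, 4.0, 6.0]
    predicted_rewards_on_model = [1.0, 4.0, 5.0]
    model_propensities = [0.4, 0.3, 0.7]
    # Indicator(model slate == logged slate): only the second log matches.
    matches = [0.0, 1.0, 0.0]
    n = len(logged_rewards)

    # Direct Method: average predicted reward on the model's slates.
    dm = sum(predicted_rewards_on_model) / n  # (1 + 4 + 5) / 3
    # Greedy IPS: logged reward / logged propensity where slates match, else 0.
    ips = (
        sum(m * r / p for m, r, p in zip(matches, logged_rewards, logged_propensities))
        / n
    )  # 5 / 0.5 / 3
    # Doubly Robust: DM plus the propensity-scaled reward residual on matches.
    dr = (
        dm
        + sum(
            m * (r - rp) / p
            for m, r, rp, p in zip(
                matches,
                logged_rewards,
                predicted_rewards_on_logged,
                logged_propensities,
            )
        )
        / n
    )  # DM + 1 / 0.5 * (5 - 4) / 3
    # Non-greedy IPS: importance weight = model propensity / logged propensity.
    ips_non_greedy = (
        sum(
            mp / p * r
            for mp, p, r in zip(model_propensities, logged_propensities, logged_rewards)
        )
        / n
    )  # (0.4 / 0.2 * 4 + 0.3 / 0.5 * 5 + 0.7 / 0.4 * 7) / 3
    return dm, ips, dr, ips_non_greedy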
def _simulated_training_input(
    self, training_input, sim_tgt_out_idx, sim_distance, device
):
    batch_size, max_tgt_seq_len = sim_tgt_out_idx.shape
    (
        _,
        max_src_seq_len,
        candidate_feat_dim,
    ) = training_input.src_seq.float_features.shape

    # candidates + padding_symbol + decoder_start_symbol
    candidate_size = max_src_seq_len + 2
    src_seq_augment = torch.zeros(
        batch_size, candidate_size, candidate_feat_dim, device=device
    )
    src_seq_augment[:, 2:, :] = training_input.src_seq.float_features

    sim_tgt_in_idx = torch.zeros_like(sim_tgt_out_idx).long()
    sim_tgt_in_idx[:, 0] = DECODER_START_SYMBOL
    sim_tgt_in_idx[:, 1:] = sim_tgt_out_idx[:, :-1]

    sim_tgt_in_seq = rlt.PreprocessedFeatureVector(
        float_features=src_seq_augment[
            torch.arange(batch_size, device=device).repeat_interleave(  # type: ignore
                max_tgt_seq_len
            ),
            sim_tgt_in_idx.flatten(),
        ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
    )
    sim_tgt_out_seq = rlt.PreprocessedFeatureVector(
        float_features=src_seq_augment[
            torch.arange(batch_size, device=device).repeat_interleave(  # type: ignore
                max_tgt_seq_len
            ),
            sim_tgt_out_idx.flatten(),
        ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
    )
    sim_tgt_out_probs = torch.tensor(
        [1.0 / len(self.permutation_index)], device=self.device
    ).repeat(batch_size)

    if self.reward_net is None:
        self.reward_net = _load_reward_net(self.reward_net_path, self.use_gpu)
    slate_reward = (
        self.reward_net(
            training_input.state.float_features,
            training_input.src_seq.float_features,
            sim_tgt_out_seq.float_features,
            training_input.src_src_mask,
            sim_tgt_out_idx,
        )
        .squeeze()
        .detach()
    )
    # guard-rail reward prediction range
    reward_clamp = self.parameters.simulation_reward_clamp
    if reward_clamp is not None:
        slate_reward = torch.clamp(
            slate_reward, min=reward_clamp.clamp_min, max=reward_clamp.clamp_max
        )
    # guard-rail sequence similarity
    distance_penalty = self.parameters.simulation_distance_penalty
    if distance_penalty is not None:
        slate_reward += distance_penalty * (self.MAX_DISTANCE - sim_distance)

    on_policy_input = rlt.PreprocessedRankingInput(
        state=training_input.state,
        src_seq=training_input.src_seq,
        src_src_mask=training_input.src_src_mask,
        tgt_in_seq=sim_tgt_in_seq,
        tgt_out_seq=sim_tgt_out_seq,
        tgt_tgt_mask=training_input.tgt_tgt_mask,
        slate_reward=slate_reward,
        src_in_idx=training_input.src_in_idx,
        tgt_in_idx=sim_tgt_in_idx,
        tgt_out_idx=sim_tgt_out_idx,
        tgt_out_probs=sim_tgt_out_probs,
    )
    return on_policy_input
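# A small illustration (added here; not part of the original file) of the
# teacher-forcing shift above: the decoder input sequence is the output
# sequence shifted right by one position, with DECODER_START_SYMBOL filling
# the first slot. For example, assuming DECODER_START_SYMBOL == 1:
#   sim_tgt_out_idx = [[3, 2, 4]]  ->  sim_tgt_in_idx = [[1, 3, 2]]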
def _simulated_training_input(
    self, training_input, sim_tgt_out_idx, sim_distance, device
):
    batch_size, max_tgt_seq_len = sim_tgt_out_idx.shape
    (
        _,
        max_src_seq_len,
        candidate_feat_dim,
    ) = training_input.src_seq.float_features.shape

    # candidates + padding_symbol + decoder_start_symbol
    candidate_size = max_src_seq_len + 2
    src_seq_augment = torch.zeros(
        batch_size, candidate_size, candidate_feat_dim, device=device
    )
    src_seq_augment[:, 2:, :] = training_input.src_seq.float_features

    sim_tgt_in_idx = torch.zeros_like(sim_tgt_out_idx).long()
    sim_tgt_in_idx[:, 0] = DECODER_START_SYMBOL
    sim_tgt_in_idx[:, 1:] = sim_tgt_out_idx[:, :-1]

    sim_tgt_in_seq = rlt.FeatureData(
        float_features=src_seq_augment[
            torch.arange(batch_size, device=device).repeat_interleave(
                max_tgt_seq_len
            ),
            sim_tgt_in_idx.flatten(),
        ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
    )
    sim_tgt_out_seq = rlt.FeatureData(
        float_features=src_seq_augment[
            torch.arange(batch_size, device=device).repeat_interleave(
                max_tgt_seq_len
            ),
            sim_tgt_out_idx.flatten(),
        ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
    )
    sim_tgt_out_probs = torch.tensor(
        [1.0 / len(self.permutation_index)], device=self.device
    ).repeat(batch_size)

    if self.reward_net is None:
        self.reward_net = _load_reward_net(self.reward_net_path, self.use_gpu)
    slate_reward = self.reward_net(
        training_input.state.float_features,
        training_input.src_seq.float_features,
        sim_tgt_out_seq.float_features,
        training_input.src_src_mask,
        sim_tgt_out_idx,
    ).detach()
    if slate_reward.ndim == 1:
        logger.warning("Slate reward should be 2-D tensor, unsqueezing")
        slate_reward = slate_reward.unsqueeze(1)
    elif slate_reward.ndim != 2:
        raise RuntimeError("Expect slate reward to be 2-D tensor")
    # guard-rail reward prediction range
    reward_clamp = self.parameters.simulation_reward_clamp
    if reward_clamp is not None:
        slate_reward = torch.clamp(
            slate_reward, min=reward_clamp.clamp_min, max=reward_clamp.clamp_max
        )
    # guard-rail sequence similarity
    distance_penalty = self.parameters.simulation_distance_penalty
    if distance_penalty is not None:
        slate_reward += distance_penalty * (self.MAX_DISTANCE - sim_distance)
    assert (
        len(slate_reward.shape) == 2 and slate_reward.shape[1] == 1
    ), f"{slate_reward.shape}"

    on_policy_input = rlt.PreprocessedRankingInput(
        state=training_input.state,
        src_seq=training_input.src_seq,
        src_src_mask=training_input.src_src_mask,
        tgt_in_seq=sim_tgt_in_seq,
        tgt_out_seq=sim_tgt_out_seq,
        tgt_tgt_mask=training_input.tgt_tgt_mask,
        slate_reward=slate_reward,
        src_in_idx=training_input.src_in_idx,
        tgt_in_idx=sim_tgt_in_idx,
        tgt_out_idx=sim_tgt_out_idx,
        tgt_out_probs=sim_tgt_out_probs,
    )
    return on_policy_input
def _simulated_training_input(
    self, training_input, sim_tgt_out_idx, sim_distance, device
):
    batch_size, max_tgt_seq_len = sim_tgt_out_idx.shape
    (
        _,
        max_src_seq_len,
        candidate_feat_dim,
    ) = training_input.src_seq.float_features.shape

    # candidates + padding_symbol + decoder_start_symbol
    candidate_size = max_src_seq_len + 2
    src_seq_augment = torch.zeros(
        batch_size, candidate_size, candidate_feat_dim, device=device
    )
    src_seq_augment[:, 2:, :] = training_input.src_seq.float_features

    sim_tgt_in_idx = torch.zeros_like(sim_tgt_out_idx).long()
    sim_tgt_in_idx[:, 0] = DECODER_START_SYMBOL
    sim_tgt_in_idx[:, 1:] = sim_tgt_out_idx[:, :-1]

    sim_tgt_in_seq = rlt.FeatureData(
        float_features=src_seq_augment[
            torch.arange(batch_size, device=device).repeat_interleave(
                max_tgt_seq_len
            ),
            sim_tgt_in_idx.flatten(),
        ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
    )
    sim_tgt_out_seq = rlt.FeatureData(
        float_features=src_seq_augment[
            torch.arange(batch_size, device=device).repeat_interleave(
                max_tgt_seq_len
            ),
            sim_tgt_out_idx.flatten(),
        ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
    )
    sim_tgt_out_probs = torch.tensor(
        [1.0 / len(self.permutation_index)], device=self.device
    ).repeat(batch_size)

    if not self.reward_name_and_net:
        self.reward_name_and_net = _load_reward_net(
            self.sim_param.reward_name_path, self.use_gpu
        )

    sim_slate_reward = torch.zeros_like(training_input.slate_reward)
    for name, reward_net in self.reward_name_and_net.items():
        weight = self.sim_param.reward_name_weight[name]
        sr = reward_net(
            training_input.state.float_features,
            training_input.src_seq.float_features,
            sim_tgt_out_seq.float_features,
            training_input.src_src_mask,
            sim_tgt_out_idx,
        ).detach()
        assert sr.ndim == 2, f"Slate reward {name} output should be 2-D tensor"
        sim_slate_reward += weight * sr

    # guard-rail reward prediction range
    reward_clamp = self.sim_param.reward_clamp
    if reward_clamp is not None:
        sim_slate_reward = torch.clamp(
            sim_slate_reward, min=reward_clamp.clamp_min, max=reward_clamp.clamp_max
        )
    # guard-rail sequence similarity
    distance_penalty = self.sim_param.distance_penalty
    if distance_penalty is not None:
        sim_slate_reward += distance_penalty * (self.MAX_DISTANCE - sim_distance)

    assert (
        len(sim_slate_reward.shape) == 2 and sim_slate_reward.shape[1] == 1
    ), f"{sim_slate_reward.shape}"

    on_policy_input = rlt.PreprocessedRankingInput(
        state=training_input.state,
        src_seq=training_input.src_seq,
        src_src_mask=training_input.src_src_mask,
        tgt_in_seq=sim_tgt_in_seq,
        tgt_out_seq=sim_tgt_out_seq,
        tgt_tgt_mask=training_input.tgt_tgt_mask,
        slate_reward=sim_slate_reward,
        src_in_idx=training_input.src_in_idx,
        tgt_in_idx=sim_tgt_in_idx,
        tgt_out_idx=sim_tgt_out_idx,
        tgt_out_probs=sim_tgt_out_probs,
    )
    return on_policy_input
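# A minimal, self-contained sketch (the helper name is hypothetical; not part
# of the original module) of the index-offset-and-gather trick used above:
# candidate indices are shifted by 2 because index 0 stands for the padding
# symbol and index 1 for the decoder-start symbol, so real candidates occupy
# rows 2.. of the augmented tensor.
import torch


def _gather_slate_features_sketch(src_seq, tgt_idx):
    """src_seq: (batch, src_len, feat); tgt_idx: (batch, tgt_len), values >= 2."""
    batch_size, src_len, feat_dim = src_seq.shape
    tgt_len = tgt_idx.shape[1]
    # Rows 0 and 1 stay zero: they represent padding and decoder-start.
    augment = torch.zeros(batch_size, src_len + 2, feat_dim)
    augment[:, 2:, :] = src_seq
    # Advanced indexing: for every (b, t), pick row tgt_idx[b, t] of batch b,
    # yielding a (batch * tgt_len, feat) tensor, then restore the batch shape.
    return augment[
        torch.arange(batch_size).repeat_interleave(tgt_len),
        tgt_idx.flatten(),
    ].view(batch_size, tgt_len, feat_dim)


# Example: with src_len == 2, index 3 selects the second candidate
# (row 3 of the augmented tensor corresponds to row 1 of src_seq).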