def forward(
    self,
    state: torch.Tensor,
    src_seq: torch.Tensor,
    tgt_out_seq: torch.Tensor,
    src_src_mask: torch.Tensor,
    tgt_out_idx: torch.Tensor,
) -> torch.Tensor:
    return self.model(
        rlt.PreprocessedRankingInput(
            state=rlt.FeatureData(float_features=state),
            src_seq=rlt.FeatureData(float_features=src_seq),
            tgt_out_seq=rlt.FeatureData(float_features=tgt_out_seq),
            src_src_mask=src_src_mask,
            tgt_out_idx=tgt_out_idx,
        )
    ).predicted_reward
def test_seq2slate_eval_data_page(self):
    """
    Create 3 slate ranking logs and evaluate using Direct Method, Inverse
    Propensity Scores, and Doubly Robust.

    The logs are as follows:
    state: [1, 0, 0], [0, 1, 0], [0, 0, 1]
    indices in logged slates: [3, 2], [3, 2], [3, 2]
    model output indices: [2, 3], [3, 2], [2, 3]
    logged reward: 4, 5, 7
    logged propensities: 0.2, 0.5, 0.4
    predicted rewards on logged slates: 2, 4, 6
    predicted rewards on model outputted slates: 1, 4, 5
    predicted propensities: 0.4, 0.3, 0.7

    When eval_greedy=True:

    Direct Method uses the predicted rewards on model outputted slates.
    Thus the result is expected to be (1 + 4 + 5) / 3.

    Inverse Propensity Scores scales the reward by 1.0 / logged propensity
    whenever the model output slate matches the logged slate. Since only the
    second log matches the model output, the IPS result is expected to be
    5 / 0.5 / 3.

    Doubly Robust is the sum of the Direct Method result and the
    propensity-scaled reward difference; the latter is defined as:
    1.0 / logged_propensity * (logged reward - predicted reward on logged slate)
    * Indicator(model slate == logged slate).
    Since only the second logged slate matches the model outputted slate, the
    DR result is expected to be (1 + 4 + 5) / 3 + 1.0 / 0.5 * (5 - 4) / 3.

    When eval_greedy=False:

    Only Inverse Propensity Scores would be accurate, because it would be too
    expensive to compute all possible slates' propensities and predicted
    rewards for Direct Method.

    The expected IPS = (0.4 / 0.2 * 4 + 0.3 / 0.5 * 5 + 0.7 / 0.4 * 7) / 3.
    """
    batch_size = 3
    state_dim = 3
    src_seq_len = 2
    tgt_seq_len = 2
    candidate_dim = 2

    reward_net = FakeSeq2SlateRewardNetwork()
    seq2slate_net = FakeSeq2SlateTransformerNet()

    src_seq = torch.eye(candidate_dim).repeat(batch_size, 1, 1)
    tgt_out_idx = torch.LongTensor([[3, 2], [3, 2], [3, 2]])
    tgt_out_seq = src_seq[
        torch.arange(batch_size).repeat_interleave(tgt_seq_len),
        tgt_out_idx.flatten() - 2,
    ].reshape(batch_size, tgt_seq_len, candidate_dim)

    ptb = rlt.PreprocessedTrainingBatch(
        training_input=rlt.PreprocessedRankingInput(
            state=rlt.FeatureData(float_features=torch.eye(state_dim)),
            src_seq=rlt.FeatureData(float_features=src_seq),
            tgt_out_seq=rlt.FeatureData(float_features=tgt_out_seq),
            src_src_mask=torch.ones(batch_size, src_seq_len, src_seq_len),
            tgt_out_idx=tgt_out_idx,
            tgt_out_probs=torch.tensor([0.2, 0.5, 0.4]),
            slate_reward=torch.tensor([4.0, 5.0, 7.0]),
        ),
        extras=rlt.ExtraData(
            sequence_number=torch.tensor([0, 0, 0]),
            mdp_id=np.array(["0", "1", "2"]),
        ),
    )

    edp = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net, reward_net, ptb.training_input, eval_greedy=True
    )
    logger.info("---------- Start evaluating eval_greedy=True -----------------")
    doubly_robust_estimator = OPEstimatorAdapter(DoublyRobustEstimator())
    dm_estimator = OPEstimatorAdapter(DMEstimator())
    ips_estimator = OPEstimatorAdapter(IPSEstimator())
    switch_estimator = OPEstimatorAdapter(SwitchEstimator())
    switch_dr_estimator = OPEstimatorAdapter(SwitchDREstimator())

    doubly_robust = doubly_robust_estimator.estimate(edp)
    inverse_propensity = ips_estimator.estimate(edp)
    direct_method = dm_estimator.estimate(edp)
    # Verify that Switch with low exponent is equivalent to IPS
    switch_ips = switch_estimator.estimate(edp, exp_base=1)
    # Verify that Switch with no candidates is equivalent to DM
    switch_dm = switch_estimator.estimate(edp, candidates=0)
    # Verify that SwitchDR with low exponent is equivalent to DR
    switch_dr_dr = switch_dr_estimator.estimate(edp, exp_base=1)
    # Verify that SwitchDR with no candidates is equivalent to DM
    switch_dr_dm = switch_dr_estimator.estimate(edp, candidates=0)
    logger.info(f"{direct_method}, {inverse_propensity}, {doubly_robust}")

    avg_logged_reward = (4 + 5 + 7) / 3
    self.assertAlmostEqual(direct_method.raw, (1 + 4 + 5) / 3, delta=1e-6)
    self.assertAlmostEqual(
        direct_method.normalized, direct_method.raw / avg_logged_reward, delta=1e-6
    )
    self.assertAlmostEqual(inverse_propensity.raw, 5 / 0.5 / 3, delta=1e-6)
    self.assertAlmostEqual(
        inverse_propensity.normalized,
        inverse_propensity.raw / avg_logged_reward,
        delta=1e-6,
    )
    self.assertAlmostEqual(
        doubly_robust.raw, direct_method.raw + 1 / 0.5 * (5 - 4) / 3, delta=1e-6
    )
    self.assertAlmostEqual(
        doubly_robust.normalized, doubly_robust.raw / avg_logged_reward, delta=1e-6
    )
    self.assertAlmostEqual(switch_ips.raw, inverse_propensity.raw, delta=1e-6)
    self.assertAlmostEqual(switch_dm.raw, direct_method.raw, delta=1e-6)
    self.assertAlmostEqual(switch_dr_dr.raw, doubly_robust.raw, delta=1e-6)
    self.assertAlmostEqual(switch_dr_dm.raw, direct_method.raw, delta=1e-6)
    logger.info("---------- Finish evaluating eval_greedy=True -----------------")

    logger.info("---------- Start evaluating eval_greedy=False -----------------")
    edp = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net, reward_net, ptb.training_input, eval_greedy=False
    )
    doubly_robust_estimator = OPEstimatorAdapter(DoublyRobustEstimator())
    dm_estimator = OPEstimatorAdapter(DMEstimator())
    ips_estimator = OPEstimatorAdapter(IPSEstimator())

    doubly_robust = doubly_robust_estimator.estimate(edp)
    inverse_propensity = ips_estimator.estimate(edp)
    direct_method = dm_estimator.estimate(edp)
    self.assertAlmostEqual(
        inverse_propensity.raw,
        (0.4 / 0.2 * 4 + 0.3 / 0.5 * 5 + 0.7 / 0.4 * 7) / 3,
        delta=1e-6,
    )
    self.assertAlmostEqual(
        inverse_propensity.normalized,
        inverse_propensity.raw / avg_logged_reward,
        delta=1e-6,
    )
    logger.info("---------- Finish evaluating eval_greedy=False -----------------")
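# A minimal sketch (not part of the original test; the helper name is
# hypothetical) spelling out the estimator arithmetic from the docstring
# above in plain Python. All numbers come from the three logged slates.
def _expected_estimates_sketch():
    logged_rewards = [4.0, 5.0, 7.0]
    logged_propensities = [0.2, 0.5, 0.4]
    predicted_rewards_on_logged = [2.0, 4.0, 6.0]
    predicted_rewards_on_model = [1.0, 4.0, 5.0]
    model_propensities = [0.4, 0.3, 0.7]
    # Indicator(model slate == logged slate): only the second log matches.
    matches = [0.0, 1.0, 0.0]
    n = len(logged_rewards)

    # Direct Method: average predicted reward on the model's slates.
    dm = sum(predicted_rewards_on_model) / n  # (1 + 4 + 5) / 3
    # Greedy IPS: logged reward / logged propensity where slates match, else 0.
    ips = (
        sum(m * r / p for m, r, p in zip(matches, logged_rewards, logged_propensities))
        / n
    )  # 5 / 0.5 / 3
    # Doubly Robust: DM plus the propensity-scaled reward residual on matches.
    dr = (
        dm
        + sum(
            m * (r - rp) / p
            for m, r, rp, p in zip(
                matches,
                logged_rewards,
                predicted_rewards_on_logged,
                logged_propensities,
            )
        )
        / n
    )  # DM + 1 / 0.5 * (5 - 4) / 3
    # Non-greedy IPS: importance weight = model propensity / logged propensity.
    ips_non_greedy = (
        sum(
            mp / p * r
            for mp, p, r in zip(model_propensities, logged_propensities, logged_rewards)
        )
        / n
    )  # (0.4 / 0.2 * 4 + 0.3 / 0.5 * 5 + 0.7 / 0.4 * 7) / 3
    return dm, ips, dr, ips_non_greedy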
def _simulated_training_input(
    self, training_input, sim_tgt_out_idx, sim_distance, device
):
    batch_size, max_tgt_seq_len = sim_tgt_out_idx.shape
    (
        _,
        max_src_seq_len,
        candidate_feat_dim,
    ) = training_input.src_seq.float_features.shape

    # candidates + padding_symbol + decoder_start_symbol
    candidate_size = max_src_seq_len + 2
    src_seq_augment = torch.zeros(
        batch_size, candidate_size, candidate_feat_dim, device=device
    )
    src_seq_augment[:, 2:, :] = training_input.src_seq.float_features

    sim_tgt_in_idx = torch.zeros_like(sim_tgt_out_idx).long()
    sim_tgt_in_idx[:, 0] = DECODER_START_SYMBOL
    sim_tgt_in_idx[:, 1:] = sim_tgt_out_idx[:, :-1]

    sim_tgt_in_seq = rlt.PreprocessedFeatureVector(
        float_features=src_seq_augment[
            torch.arange(batch_size, device=device).repeat_interleave(  # type: ignore
                max_tgt_seq_len
            ),
            sim_tgt_in_idx.flatten(),
        ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
    )
    sim_tgt_out_seq = rlt.PreprocessedFeatureVector(
        float_features=src_seq_augment[
            torch.arange(batch_size, device=device).repeat_interleave(  # type: ignore
                max_tgt_seq_len
            ),
            sim_tgt_out_idx.flatten(),
        ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
    )
    sim_tgt_out_probs = torch.tensor(
        [1.0 / len(self.permutation_index)], device=self.device
    ).repeat(batch_size)

    if self.reward_net is None:
        self.reward_net = _load_reward_net(self.reward_net_path, self.use_gpu)
    slate_reward = (
        self.reward_net(
            training_input.state.float_features,
            training_input.src_seq.float_features,
            sim_tgt_out_seq.float_features,
            training_input.src_src_mask,
            sim_tgt_out_idx,
        )
        .squeeze()
        .detach()
    )
    # guard-rail reward prediction range
    reward_clamp = self.parameters.simulation_reward_clamp
    if reward_clamp is not None:
        slate_reward = torch.clamp(
            slate_reward, min=reward_clamp.clamp_min, max=reward_clamp.clamp_max
        )
    # guard-rail sequence similarity
    distance_penalty = self.parameters.simulation_distance_penalty
    if distance_penalty is not None:
        slate_reward += distance_penalty * (self.MAX_DISTANCE - sim_distance)

    on_policy_input = rlt.PreprocessedRankingInput(
        state=training_input.state,
        src_seq=training_input.src_seq,
        src_src_mask=training_input.src_src_mask,
        tgt_in_seq=sim_tgt_in_seq,
        tgt_out_seq=sim_tgt_out_seq,
        tgt_tgt_mask=training_input.tgt_tgt_mask,
        slate_reward=slate_reward,
        src_in_idx=training_input.src_in_idx,
        tgt_in_idx=sim_tgt_in_idx,
        tgt_out_idx=sim_tgt_out_idx,
        tgt_out_probs=sim_tgt_out_probs,
    )
    return on_policy_input
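# A small illustration (added here; not part of the original file) of the
# teacher-forcing shift above: the decoder input sequence is the output
# sequence shifted right by one position, with DECODER_START_SYMBOL filling
# the first slot. For example, assuming DECODER_START_SYMBOL == 1:
#   sim_tgt_out_idx = [[3, 2, 4]]  ->  sim_tgt_in_idx = [[1, 3, 2]]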
def _simulated_training_input(
    self, training_input, sim_tgt_out_idx, sim_distance, device
):
    batch_size, max_tgt_seq_len = sim_tgt_out_idx.shape
    (
        _,
        max_src_seq_len,
        candidate_feat_dim,
    ) = training_input.src_seq.float_features.shape

    # candidates + padding_symbol + decoder_start_symbol
    candidate_size = max_src_seq_len + 2
    src_seq_augment = torch.zeros(
        batch_size, candidate_size, candidate_feat_dim, device=device
    )
    src_seq_augment[:, 2:, :] = training_input.src_seq.float_features

    sim_tgt_in_idx = torch.zeros_like(sim_tgt_out_idx).long()
    sim_tgt_in_idx[:, 0] = DECODER_START_SYMBOL
    sim_tgt_in_idx[:, 1:] = sim_tgt_out_idx[:, :-1]

    sim_tgt_in_seq = rlt.FeatureData(
        float_features=src_seq_augment[
            torch.arange(batch_size, device=device).repeat_interleave(
                max_tgt_seq_len
            ),
            sim_tgt_in_idx.flatten(),
        ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
    )
    sim_tgt_out_seq = rlt.FeatureData(
        float_features=src_seq_augment[
            torch.arange(batch_size, device=device).repeat_interleave(
                max_tgt_seq_len
            ),
            sim_tgt_out_idx.flatten(),
        ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
    )
    sim_tgt_out_probs = torch.tensor(
        [1.0 / len(self.permutation_index)], device=self.device
    ).repeat(batch_size)

    if self.reward_net is None:
        self.reward_net = _load_reward_net(self.reward_net_path, self.use_gpu)
    slate_reward = self.reward_net(
        training_input.state.float_features,
        training_input.src_seq.float_features,
        sim_tgt_out_seq.float_features,
        training_input.src_src_mask,
        sim_tgt_out_idx,
    ).detach()
    if slate_reward.ndim == 1:
        logger.warning("Slate reward should be 2-D tensor, unsqueezing")
        slate_reward = slate_reward.unsqueeze(1)
    elif slate_reward.ndim != 2:
        raise RuntimeError("Expect slate reward to be 2-D tensor")
    # guard-rail reward prediction range
    reward_clamp = self.parameters.simulation_reward_clamp
    if reward_clamp is not None:
        slate_reward = torch.clamp(
            slate_reward, min=reward_clamp.clamp_min, max=reward_clamp.clamp_max
        )
    # guard-rail sequence similarity
    distance_penalty = self.parameters.simulation_distance_penalty
    if distance_penalty is not None:
        slate_reward += distance_penalty * (self.MAX_DISTANCE - sim_distance)
    assert (
        len(slate_reward.shape) == 2 and slate_reward.shape[1] == 1
    ), f"{slate_reward.shape}"

    on_policy_input = rlt.PreprocessedRankingInput(
        state=training_input.state,
        src_seq=training_input.src_seq,
        src_src_mask=training_input.src_src_mask,
        tgt_in_seq=sim_tgt_in_seq,
        tgt_out_seq=sim_tgt_out_seq,
        tgt_tgt_mask=training_input.tgt_tgt_mask,
        slate_reward=slate_reward,
        src_in_idx=training_input.src_in_idx,
        tgt_in_idx=sim_tgt_in_idx,
        tgt_out_idx=sim_tgt_out_idx,
        tgt_out_probs=sim_tgt_out_probs,
    )
    return on_policy_input
def _simulated_training_input(
    self, training_input, sim_tgt_out_idx, sim_distance, device
):
    batch_size, max_tgt_seq_len = sim_tgt_out_idx.shape
    (
        _,
        max_src_seq_len,
        candidate_feat_dim,
    ) = training_input.src_seq.float_features.shape

    # candidates + padding_symbol + decoder_start_symbol
    candidate_size = max_src_seq_len + 2
    src_seq_augment = torch.zeros(
        batch_size, candidate_size, candidate_feat_dim, device=device
    )
    src_seq_augment[:, 2:, :] = training_input.src_seq.float_features

    sim_tgt_in_idx = torch.zeros_like(sim_tgt_out_idx).long()
    sim_tgt_in_idx[:, 0] = DECODER_START_SYMBOL
    sim_tgt_in_idx[:, 1:] = sim_tgt_out_idx[:, :-1]

    sim_tgt_in_seq = rlt.FeatureData(
        float_features=src_seq_augment[
            torch.arange(batch_size, device=device).repeat_interleave(
                max_tgt_seq_len
            ),
            sim_tgt_in_idx.flatten(),
        ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
    )
    sim_tgt_out_seq = rlt.FeatureData(
        float_features=src_seq_augment[
            torch.arange(batch_size, device=device).repeat_interleave(
                max_tgt_seq_len
            ),
            sim_tgt_out_idx.flatten(),
        ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
    )
    sim_tgt_out_probs = torch.tensor(
        [1.0 / len(self.permutation_index)], device=self.device
    ).repeat(batch_size)

    if not self.reward_name_and_net:
        self.reward_name_and_net = _load_reward_net(
            self.sim_param.reward_name_path, self.use_gpu
        )

    sim_slate_reward = torch.zeros_like(training_input.slate_reward)
    for name, reward_net in self.reward_name_and_net.items():
        weight = self.sim_param.reward_name_weight[name]
        sr = reward_net(
            training_input.state.float_features,
            training_input.src_seq.float_features,
            sim_tgt_out_seq.float_features,
            training_input.src_src_mask,
            sim_tgt_out_idx,
        ).detach()
        assert sr.ndim == 2, f"Slate reward {name} output should be 2-D tensor"
        sim_slate_reward += weight * sr

    # guard-rail reward prediction range
    reward_clamp = self.sim_param.reward_clamp
    if reward_clamp is not None:
        sim_slate_reward = torch.clamp(
            sim_slate_reward, min=reward_clamp.clamp_min, max=reward_clamp.clamp_max
        )
    # guard-rail sequence similarity
    distance_penalty = self.sim_param.distance_penalty
    if distance_penalty is not None:
        sim_slate_reward += distance_penalty * (self.MAX_DISTANCE - sim_distance)

    assert (
        len(sim_slate_reward.shape) == 2 and sim_slate_reward.shape[1] == 1
    ), f"{sim_slate_reward.shape}"

    on_policy_input = rlt.PreprocessedRankingInput(
        state=training_input.state,
        src_seq=training_input.src_seq,
        src_src_mask=training_input.src_src_mask,
        tgt_in_seq=sim_tgt_in_seq,
        tgt_out_seq=sim_tgt_out_seq,
        tgt_tgt_mask=training_input.tgt_tgt_mask,
        slate_reward=sim_slate_reward,
        src_in_idx=training_input.src_in_idx,
        tgt_in_idx=sim_tgt_in_idx,
        tgt_out_idx=sim_tgt_out_idx,
        tgt_out_probs=sim_tgt_out_probs,
    )
    return on_policy_input
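# A minimal, self-contained sketch (the helper name is hypothetical; not part
# of the original module) of the index-offset-and-gather trick used above:
# candidate indices are shifted by 2 because index 0 stands for the padding
# symbol and index 1 for the decoder-start symbol, so real candidates occupy
# rows 2.. of the augmented tensor.
import torch


def _gather_slate_features_sketch(src_seq, tgt_idx):
    """src_seq: (batch, src_len, feat); tgt_idx: (batch, tgt_len), values >= 2."""
    batch_size, src_len, feat_dim = src_seq.shape
    tgt_len = tgt_idx.shape[1]
    # Rows 0 and 1 stay zero: they represent padding and decoder-start.
    augment = torch.zeros(batch_size, src_len + 2, feat_dim)
    augment[:, 2:, :] = src_seq
    # Advanced indexing: for every (b, t), pick row tgt_idx[b, t] of batch b,
    # yielding a (batch * tgt_len, feat) tensor, then restore the batch shape.
    return augment[
        torch.arange(batch_size).repeat_interleave(tgt_len),
        tgt_idx.flatten(),
    ].view(batch_size, tgt_len, feat_dim)


# Example: with src_len == 2, index 3 selects the second candidate
# (row 3 of the augmented tensor corresponds to row 1 of src_seq).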