Exemple #1
0
 def as_parametric_maxq_training_batch(self):
     """Build a parametric-DQN (max-Q) training batch from this object's tensors.

     The columns of ``possible_next_actions_state_concat`` are split at
     ``state_dim`` (the second dimension of ``self.states``): the columns
     before the split feed ``tiled_next_state`` and the columns from the
     split onward feed ``possible_next_actions``. ``possible_actions`` is
     always passed as ``None`` and ``extras`` is an empty ``ExtraData``.
     """
     feature_vector = rlt.PreprocessedFeatureVector
     state_dim = self.states.shape[1]

     # State columns come first in the concatenated tensor, action columns after.
     tiled_next_state_features = self.possible_next_actions_state_concat[
         :, :state_dim
     ]
     possible_next_action_features = self.possible_next_actions_state_concat[
         :, state_dim:
     ]

     parametric_input = rlt.PreprocessedParametricDqnInput(
         state=feature_vector(float_features=self.states),
         action=feature_vector(float_features=self.actions),
         next_state=feature_vector(float_features=self.next_states),
         next_action=feature_vector(float_features=self.next_actions),
         tiled_next_state=feature_vector(
             float_features=tiled_next_state_features
         ),
         possible_actions=None,
         possible_actions_mask=self.possible_actions_mask,
         possible_next_actions=feature_vector(
             float_features=possible_next_action_features
         ),
         possible_next_actions_mask=self.possible_next_actions_mask,
         reward=self.rewards,
         not_terminal=self.not_terminal,
         step=self.step,
         time_diff=self.time_diffs,
     )
     return rlt.PreprocessedTrainingBatch(
         training_input=parametric_input,
         extras=rlt.ExtraData(),
     )
Exemple #2
0
    def sample_memories(self, batch_size, use_gpu=False, batch_first=False):
        """
        Draw a random batch of stored sequences as a memory-network training batch.

        Indices are drawn with ``np.random.randint`` over ``self.memory_size``,
        so the same entry may appear more than once in a batch.

        :param batch_size: number of samples to return
        :param use_gpu: whether to put samples on gpu
        :param batch_first: If True, the first dimension of data is batch_size.
            If False (default), the first dimension is SEQ_LEN, e.g. state's
            shape is SEQ_LEN x BATCH_SIZE x STATE_DIM. By default, MDN-RNN
            consumes data with SEQ_LEN as the first dimension.
        :return: a ``PreprocessedTrainingBatch`` whose ``time_diff`` is all
            ones, with ``step`` and ``extras`` set to None.
        """
        device = torch.device("cuda" if use_gpu else "cpu")
        sample_indices = np.random.randint(self.memory_size, size=batch_size)

        # Regroup the sampled entries field by field, then stack each field
        # into one float tensor on the target device.
        # state/next state shape: batch_size x seq_len x state_dim
        # action shape: batch_size x seq_len x action_dim
        # reward/not_terminal shape: batch_size x seq_len
        fields = zip(*self.deque_sample(sample_indices))
        state, action, next_state, reward, not_terminal = (
            stack(field).float().to(device) for field in fields
        )

        if not batch_first:
            state, action, next_state, reward, not_terminal = transpose(
                state, action, next_state, reward, not_terminal
            )

        memory_input = rlt.PreprocessedMemoryNetworkInput(
            state=rlt.PreprocessedFeatureVector(float_features=state),
            reward=reward,
            time_diff=torch.ones_like(reward).float(),
            action=action,
            next_state=rlt.PreprocessedFeatureVector(float_features=next_state),
            not_terminal=not_terminal,
            step=None,
        )
        return rlt.PreprocessedTrainingBatch(
            training_input=memory_input, extras=None
        )
Exemple #3
0
 def preprocess_batch(train_batch: Any) -> rlt.PreprocessedTrainingBatch:
     """Convert a sampled 10-field batch into a policy-network training batch.

     ``train_batch`` must unpack into exactly ten items, in this order:
     obs, action, reward, next_obs, next_action, next_reward, terminal,
     idxs, possible_actions_mask, log_prob. Only obs, action, reward,
     next_obs, next_action and terminal are used; the remaining fields are
     ignored because the policy-network input built here has no slot for them.

     Returns:
         A ``PreprocessedTrainingBatch`` with ``step`` and ``time_diff`` set
         to None and an empty ``ExtraData``.
     """
     (
         obs,
         action,
         reward,
         next_obs,
         next_action,
         _next_reward,
         terminal,
         _idxs,
         _possible_actions_mask,
         _log_prob,
     ) = train_batch

     # Observations are squeezed along dim 2; reward and terminal gain a
     # trailing dimension of size 1.
     obs = torch.tensor(obs).squeeze(2)
     action = torch.tensor(action).float()
     reward = torch.tensor(reward).unsqueeze(1)
     next_obs = torch.tensor(next_obs).squeeze(2)
     next_action = torch.tensor(next_action)
     # Flip the terminal flag: 1.0 where the episode continues, 0.0 where it ended.
     not_terminal = 1.0 - torch.tensor(terminal).unsqueeze(1).float()

     return rlt.PreprocessedTrainingBatch(
         training_input=rlt.PreprocessedPolicyNetworkInput(
             state=rlt.PreprocessedFeatureVector(float_features=obs),
             action=rlt.PreprocessedFeatureVector(float_features=action),
             next_state=rlt.PreprocessedFeatureVector(
                 float_features=next_obs),
             next_action=rlt.PreprocessedFeatureVector(
                 float_features=next_action),
             reward=reward,
             not_terminal=not_terminal,
             step=None,
             time_diff=None,
         ),
         extras=rlt.ExtraData(),
     )
 def as_slate_q_training_batch(self):
     """Build a SlateQ training batch from this object's tensors.

     ``possible_actions_state_concat`` and
     ``possible_next_actions_state_concat`` are each split column-wise at
     ``state_dim``: the leading columns are viewed as
     ``(batch_size, -1, state_dim)`` for the tiled states, and the remaining
     columns are viewed as ``(batch_size, -1, action_dim)`` for the slate
     actions. Logged propensities are used both as the slate item
     probabilities and as ``action_probability`` in the extras.
     """
     batch_size, state_dim = self.states.shape
     action_dim = self.actions.shape[1]
     current_concat = self.possible_actions_state_concat
     next_concat = self.possible_next_actions_state_concat

     tiled_state = rlt.PreprocessedTiledFeatureVector(
         float_features=current_concat[:, :state_dim].view(
             batch_size, -1, state_dim
         )
     )
     tiled_next_state = rlt.PreprocessedTiledFeatureVector(
         float_features=next_concat[:, :state_dim].view(
             batch_size, -1, state_dim
         )
     )
     slate_action = rlt.PreprocessedSlateFeatureVector(
         float_features=current_concat[:, state_dim:].view(
             batch_size, -1, action_dim
         ),
         item_mask=self.possible_actions_mask,
         item_probability=self.propensities,
     )
     next_slate_action = rlt.PreprocessedSlateFeatureVector(
         float_features=next_concat[:, state_dim:].view(
             batch_size, -1, action_dim
         ),
         item_mask=self.possible_next_actions_mask,
         item_probability=self.next_propensities,
     )

     slate_q_input = rlt.PreprocessedSlateQInput(
         state=rlt.PreprocessedFeatureVector(float_features=self.states),
         next_state=rlt.PreprocessedFeatureVector(
             float_features=self.next_states
         ),
         tiled_state=tiled_state,
         tiled_next_state=tiled_next_state,
         action=slate_action,
         next_action=next_slate_action,
         reward=self.rewards,
         reward_mask=self.rewards_mask,
         time_diff=self.time_diffs,
         step=self.step,
         not_terminal=self.not_terminal,
     )
     extra_data = rlt.ExtraData(
         mdp_id=self.mdp_ids,
         sequence_number=self.sequence_numbers,
         action_probability=self.propensities,
         max_num_actions=self.max_num_actions,
         metrics=self.metrics,
     )
     return rlt.PreprocessedTrainingBatch(
         training_input=slate_q_input, extras=extra_data
     )
Exemple #5
0
 def as_policy_network_training_batch(self):
     """Build a policy-network training batch from this object's tensors.

     States, actions and their "next" counterparts are each wrapped in a
     ``PreprocessedFeatureVector``; reward, not_terminal, step and time_diff
     are forwarded unchanged. ``extras`` is an empty ``ExtraData``.
     """
     feature_vector = rlt.PreprocessedFeatureVector
     policy_input = rlt.PreprocessedPolicyNetworkInput(
         state=feature_vector(float_features=self.states),
         action=feature_vector(float_features=self.actions),
         next_state=feature_vector(float_features=self.next_states),
         next_action=feature_vector(float_features=self.next_actions),
         reward=self.rewards,
         not_terminal=self.not_terminal,
         step=self.step,
         time_diff=self.time_diffs,
     )
     return rlt.PreprocessedTrainingBatch(
         training_input=policy_input,
         extras=rlt.ExtraData(),
     )
Exemple #6
0
    def as_cem_training_batch(self, batch_first=False):
        """
        Produce the one-step samples consumed by the CEM trainer.

        The samples are used to train the ensemble of world models that CEM
        relies on. A sequence-length dimension of size 1 is inserted into
        states, next states and actions.

        With batch_first = True:
            state/next state shape: batch_size x 1 x state_dim
            action shape: batch_size x 1 x action_dim
            reward/terminal shape: batch_size x 1
        Otherwise (default):
            state/next state shape: 1 x batch_size x state_dim
            action shape: 1 x batch_size x action_dim
            reward/terminal shape: 1 x batch_size
        """
        if not batch_first:
            # Sequence-first layout: the new axis goes in front, and
            # reward / not_terminal are transposed to match.
            seq_len_dim = 0
            reward, not_terminal = transpose(self.rewards, self.not_terminal)
        else:
            seq_len_dim = 1
            reward, not_terminal = self.rewards, self.not_terminal

        state_features = self.states.unsqueeze(seq_len_dim)
        action_features = self.actions.unsqueeze(seq_len_dim)
        next_state_features = self.next_states.unsqueeze(seq_len_dim)

        training_input = rlt.PreprocessedMemoryNetworkInput(
            state=rlt.PreprocessedFeatureVector(float_features=state_features),
            action=action_features,
            next_state=rlt.PreprocessedFeatureVector(
                float_features=next_state_features
            ),
            reward=reward,
            not_terminal=not_terminal,
            step=self.step,
            time_diff=self.time_diffs,
        )
        extra_data = rlt.ExtraData(
            mdp_id=self.mdp_ids,
            sequence_number=self.sequence_numbers,
            action_probability=self.propensities,
            max_num_actions=self.max_num_actions,
            metrics=self.metrics,
        )
        return rlt.PreprocessedTrainingBatch(
            training_input=training_input, extras=extra_data
        )
Exemple #7
0
    def preprocess_batch(train_batch: Any) -> rlt.PreprocessedTrainingBatch:
        """Convert a sampled 10-field batch into a parametric-DQN training batch.

        ``train_batch`` must unpack into exactly ten items, in this order:
        obs, action, reward, next_obs, next_action, next_reward, terminal,
        idxs, possible_actions_mask, log_prob. The incoming
        possible_actions_mask is discarded and replaced by an all-True mask
        shaped like ``action``. ``num_actions`` is read from the enclosing
        scope.

        Every next observation is repeated ``num_actions`` times and paired
        with the rows of an identity matrix as candidate next actions;
        candidates are masked out wherever the transition is terminal.
        """
        (
            obs,
            action,
            reward,
            next_obs,
            next_action,
            _next_reward,
            terminal,
            _idxs,
            _logged_mask,
            _log_prob,
        ) = train_batch
        # Read the batch size before obs is replaced by a tensor.
        batch_size = obs.shape[0]

        obs = torch.tensor(obs).squeeze(2)
        action = torch.tensor(action).float()
        next_obs = torch.tensor(next_obs).squeeze(2)
        next_action = torch.tensor(next_action).float()
        reward = torch.tensor(reward).unsqueeze(1)
        terminal_flags = torch.tensor(terminal).unsqueeze(1).to(torch.uint8)
        not_terminal = 1 - terminal_flags
        possible_actions_mask = torch.ones_like(action).to(torch.bool)

        # One copy of each next observation per candidate action.
        tiled_next_state = torch.repeat_interleave(
            next_obs, repeats=num_actions, axis=0
        )
        # Identity rows enumerate the candidate actions for every batch item.
        possible_next_actions = torch.eye(num_actions).repeat(batch_size, 1)
        possible_next_actions_mask = not_terminal.repeat(1, num_actions).to(
            torch.bool
        )

        feature_vector = rlt.PreprocessedFeatureVector
        parametric_input = rlt.PreprocessedParametricDqnInput(
            state=feature_vector(float_features=obs),
            action=feature_vector(float_features=action),
            next_state=feature_vector(float_features=next_obs),
            next_action=feature_vector(float_features=next_action),
            possible_actions=None,
            possible_actions_mask=possible_actions_mask,
            possible_next_actions=feature_vector(
                float_features=possible_next_actions
            ),
            possible_next_actions_mask=possible_next_actions_mask,
            tiled_next_state=feature_vector(float_features=tiled_next_state),
            reward=reward,
            not_terminal=not_terminal,
            step=None,
            time_diff=None,
        )
        return rlt.PreprocessedTrainingBatch(
            parametric_input,
            extras=rlt.ExtraData(),
        )
Exemple #8
0
 def preprocess_batch(train_batch: Any) -> rlt.PreprocessedTrainingBatch:
     """Convert a sampled 10-field batch into a discrete-DQN training batch.

     ``train_batch`` must unpack into exactly ten items, in this order:
     obs, action, reward, next_obs, next_action, next_reward, terminal,
     idxs, possible_actions_mask, log_prob. ``num_actions`` is read from the
     enclosing scope, and ``action`` must have ``num_actions`` columns.

     The next-step action mask is the not-terminal flag repeated across all
     actions, and the logged action probability is ``exp(log_prob)``.

     Raises:
         AssertionError: if ``action.size(1)`` differs from ``num_actions``.
     """
     (
         obs,
         action,
         reward,
         next_obs,
         next_action,
         _next_reward,
         terminal,
         _idxs,
         possible_actions_mask,
         log_prob,
     ) = train_batch

     obs = torch.tensor(obs).squeeze(2)
     action = torch.tensor(action)
     reward = torch.tensor(reward).unsqueeze(1)
     next_obs = torch.tensor(next_obs).squeeze(2)
     next_action = torch.tensor(next_action)
     terminal_flags = torch.tensor(terminal).unsqueeze(1).float()
     not_terminal = 1.0 - terminal_flags
     possible_actions_mask = torch.tensor(possible_actions_mask)
     # The same not-terminal flag applies to every action of the next step.
     possible_next_actions_mask = not_terminal.repeat(1, num_actions)
     log_prob = torch.tensor(log_prob)

     assert action.size(1) == num_actions, (
         f"action size(1) is {action.size(1)} while num_actions is {num_actions}"
     )

     dqn_input = rlt.PreprocessedDiscreteDqnInput(
         state=rlt.PreprocessedFeatureVector(float_features=obs),
         action=action,
         next_state=rlt.PreprocessedFeatureVector(float_features=next_obs),
         next_action=next_action,
         possible_actions_mask=possible_actions_mask,
         possible_next_actions_mask=possible_next_actions_mask,
         reward=reward,
         not_terminal=not_terminal,
         step=None,
         time_diff=None,
     )
     extra_data = rlt.ExtraData(
         mdp_id=None,
         sequence_number=None,
         action_probability=log_prob.exp(),
         max_num_actions=None,
         metrics=None,
     )
     return rlt.PreprocessedTrainingBatch(
         training_input=dqn_input, extras=extra_data
     )
Exemple #9
0
 def as_discrete_maxq_training_batch(self):
     """Build a discrete-DQN (max-Q) training batch from this object's tensors.

     States are wrapped in ``PreprocessedFeatureVector``; actions, masks,
     reward, not_terminal, step and time_diff are forwarded unchanged. The
     logged propensities are passed as ``action_probability`` in the extras.
     """
     dqn_input = rlt.PreprocessedDiscreteDqnInput(
         state=rlt.PreprocessedFeatureVector(float_features=self.states),
         action=self.actions,
         next_state=rlt.PreprocessedFeatureVector(
             float_features=self.next_states
         ),
         next_action=self.next_actions,
         possible_actions_mask=self.possible_actions_mask,
         possible_next_actions_mask=self.possible_next_actions_mask,
         reward=self.rewards,
         not_terminal=self.not_terminal,
         step=self.step,
         time_diff=self.time_diffs,
     )
     extra_data = rlt.ExtraData(
         mdp_id=self.mdp_ids,
         sequence_number=self.sequence_numbers,
         action_probability=self.propensities,
         max_num_actions=self.max_num_actions,
         metrics=self.metrics,
     )
     return rlt.PreprocessedTrainingBatch(
         training_input=dqn_input, extras=extra_data
     )
    def test_seq2slate_eval_data_page(self):
        """
        Evaluate three slate-ranking logs with the Direct Method, Inverse
        Propensity Scores, and Doubly Robust estimators.

        Logged data:
        state: [1, 0, 0], [0, 1, 0], [0, 0, 1]
        indices in logged slates: [3, 2], [3, 2], [3, 2]
        model output indices: [2, 3], [3, 2], [2, 3]
        logged reward: 4, 5, 7
        logged propensities: 0.2, 0.5, 0.4
        predicted rewards on logged slates: 2, 4, 6
        predicted rewards on model outputted slates: 1, 4, 5

        Direct Method averages the predicted rewards on the model outputted
        slates, so the expected result is (1 + 4 + 5) / 3.

        Inverse Propensity Scores scales the logged reward by
        1.0 / logged propensity whenever the model output slate matches the
        logged slate. Only the second log matches, so the expected result is
        5 / 0.5 / 3.

        Doubly Robust adds a propensity-scaled reward difference to the
        direct method result; that correction is defined as:
        1.0 / logged_propensities * (logged reward - predicted reward on logged slate)
         * Indicator(model slate == logged slate)
        Only the second logged slate matches the model outputted slate, so
        the expected result is (1 + 4 + 5) / 3 + 1.0 / 0.5 * (5 - 4) / 3.
        """
        batch_size, state_dim = 3, 3
        src_seq_len, tgt_seq_len, candidate_dim = 2, 2, 2

        reward_net = FakeSeq2SlateRewardNetwork()
        seq2slate_net = FakeSeq2SlateTransformerNet()
        baseline_net = nn.Linear(1, 1)
        trainer = Seq2SlateTrainer(
            seq2slate_net,
            baseline_net,
            parameters=None,
            minibatch_size=3,
            use_gpu=False,
        )

        src_seq = torch.eye(candidate_dim).repeat(batch_size, 1, 1)
        tgt_out_idx = torch.LongTensor([[3, 2], [3, 2], [3, 2]])
        # Gather the logged candidates: one batch index per slate position,
        # and the logged index shifted down by 2 to address rows of src_seq.
        batch_index = torch.arange(batch_size).repeat_interleave(  # type: ignore
            tgt_seq_len
        )
        candidate_index = tgt_out_idx.flatten() - 2
        tgt_out_seq = src_seq[batch_index, candidate_index].reshape(
            batch_size, tgt_seq_len, candidate_dim
        )

        ranking_input = rlt.PreprocessedRankingInput(
            state=rlt.PreprocessedFeatureVector(
                float_features=torch.eye(state_dim)
            ),
            src_seq=rlt.PreprocessedFeatureVector(float_features=src_seq),
            tgt_out_seq=rlt.PreprocessedFeatureVector(
                float_features=tgt_out_seq
            ),
            src_src_mask=torch.ones(batch_size, src_seq_len, src_seq_len),
            tgt_out_idx=tgt_out_idx,
            tgt_out_probs=torch.tensor([0.2, 0.5, 0.4]),
            slate_reward=torch.tensor([4.0, 5.0, 7.0]),
        )
        extra_data = rlt.ExtraData(
            sequence_number=torch.tensor([0, 0, 0]),
            mdp_id=np.array(["0", "1", "2"]),
        )
        ptb = rlt.PreprocessedTrainingBatch(
            training_input=ranking_input, extras=extra_data
        )

        edp = EvaluationDataPage.create_from_training_batch(
            ptb, trainer, reward_net
        )
        estimates = DoublyRobustEstimator().estimate(edp)
        direct_method, inverse_propensity, doubly_robust = estimates
        logger.info(f"{direct_method}, {inverse_propensity}, {doubly_robust}")

        # Normalized scores are the raw scores divided by the mean logged reward.
        avg_logged_reward = (4 + 5 + 7) / 3
        tolerance = 1e-6

        self.assertAlmostEqual(
            direct_method.raw, (1 + 4 + 5) / 3, delta=tolerance
        )
        self.assertAlmostEqual(
            direct_method.normalized,
            direct_method.raw / avg_logged_reward,
            delta=tolerance,
        )
        self.assertAlmostEqual(
            inverse_propensity.raw, 5 / 0.5 / 3, delta=tolerance
        )
        self.assertAlmostEqual(
            inverse_propensity.normalized,
            inverse_propensity.raw / avg_logged_reward,
            delta=tolerance,
        )
        self.assertAlmostEqual(
            doubly_robust.raw,
            direct_method.raw + 1 / 0.5 * (5 - 4) / 3,
            delta=tolerance,
        )
        self.assertAlmostEqual(
            doubly_robust.normalized,
            doubly_robust.raw / avg_logged_reward,
            delta=tolerance,
        )