    def evaluate(self, batch: MemoryNetworkInput):
        """Calculate state feature sensitivity to actions:
        randomly permute actions across the batch and measure how much the
        predicted next state features deviate."""
        assert isinstance(batch, MemoryNetworkInput)

        self.trainer.memory_network.mdnrnn.eval()

        seq_len, batch_size, state_dim = batch.next_state.float_features.size()
        state_feature_num = self.state_feature_num
        feature_sensitivity = torch.zeros(state_feature_num)

        # the input of world_model has seq-len as the first dimension
        mdnrnn_output = self.trainer.memory_network(
            batch.state, FeatureData(batch.action)
        )
        predicted_next_state_means = mdnrnn_output.mus

        shuffled_mdnrnn_output = self.trainer.memory_network(
            batch.state,
            # shuffle the actions
            FeatureData(batch.action[:, torch.randperm(batch_size), :]),
        )
        shuffled_predicted_next_state_means = shuffled_mdnrnn_output.mus

        assert (
            predicted_next_state_means.size()
            == shuffled_predicted_next_state_means.size()
            == (seq_len, batch_size, self.trainer.params.num_gaussians, state_dim)
        )

        state_feature_boundaries = self.sorted_state_feature_start_indices + [state_dim]
        for i in range(state_feature_num):
            boundary_start, boundary_end = (
                state_feature_boundaries[i],
                state_feature_boundaries[i + 1],
            )
            abs_diff = torch.mean(
                torch.sum(
                    torch.abs(
                        shuffled_predicted_next_state_means[
                            :, :, :, boundary_start:boundary_end
                        ]
                        - predicted_next_state_means[
                            :, :, :, boundary_start:boundary_end
                        ]
                    ),
                    dim=3,
                )
            )
            feature_sensitivity[i] = abs_diff.cpu().detach().item()

        self.trainer.memory_network.mdnrnn.train()
        logger.info(
            "**** Debug tool feature sensitivity ****: {}".format(feature_sensitivity)
        )
        return {"feature_sensitivity": feature_sensitivity.numpy()}
Example #2
    def act(self,
            obs: rlt.FeatureData,
            possible_actions_mask: Optional[np.ndarray] = None
            ) -> rlt.ActorOutput:
        """Act randomly regardless of the observation."""
        # pyre-fixme[35]: Target cannot be annotated.
        obs: torch.Tensor = obs.float_features
        assert obs.dim() >= 2, f"obs has shape {obs.shape} (dim < 2)"
        batch_size = obs.size(0)
        # pyre-fixme[6]: Expected `Union[torch.Size, torch.Tensor]` for 1st param
        #  but got `Tuple[int]`.
        action = self.dist.sample((batch_size, ))
        # sum over action_dim (since assuming i.i.d. per coordinate)
        log_prob = self.dist.log_prob(action).sum(1)
        return rlt.ActorOutput(action=action, log_prob=log_prob)
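The .sum(1) in Example #2 relies on the stated i.i.d.-per-coordinate assumption: the log-probability of the joint action vector is the sum of the per-coordinate log-probabilities. A quick check with a standard Normal (the concrete distribution is an assumption; the code above only needs self.dist to expose sample and log_prob):

import torch

dist = torch.distributions.Normal(torch.zeros(4), torch.ones(4))  # 4 action coordinates
action = dist.sample((8,))               # shape (batch_size=8, action_dim=4)
log_prob = dist.log_prob(action).sum(1)  # joint log-prob per batch element
assert log_prob.shape == (8,)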
Example #3
    def test_get_detached_model_outputs(self):
        trainer = self._construct_trainer()
        action_scores, _ = trainer.get_detached_model_outputs(
            FeatureData(float_features=torch.rand(self.batch_size, self.state_dim))
        )
        self.assertEqual(action_scores.shape[0], self.batch_size)
        self.assertEqual(action_scores.shape[1], self.action_dim)
Example #4
def get_parametric_input(max_num_actions: int, obs: rlt.FeatureData):
    assert (len(obs.float_features.shape) == 2
            ), f"{obs.float_features.shape} is not (batch_size, state_dim)."
    batch_size, _ = obs.float_features.shape
    possible_actions = get_possible_actions_for_gym(
        batch_size, max_num_actions).to(obs.float_features.device)
    return obs.get_tiled_batch(max_num_actions), possible_actions
Example #5
    def _get_unmasked_q_values(self, q_network, state: rlt.FeatureData,
                               slate: rlt.DocList) -> torch.Tensor:
        """Get Q-values for every slate position from the given (model or target) Q-network."""
        batch_size, slate_size, _ = slate.float_features.shape
        # TODO: Probably should create a new model type
        return q_network(state.repeat_interleave(slate_size, dim=0),
                         slate.as_feature_data()).view(batch_size, slate_size)
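The repeat_interleave/view bookkeeping in Example #5 scores every (state, document) pair in a single forward pass. A shape-only sketch with plain tensors (dimensions are arbitrary, and q stands in for the q_network output):

import torch

batch_size, slate_size, state_dim, doc_dim = 4, 3, 8, 5
state = torch.rand(batch_size, state_dim)
slate = torch.rand(batch_size, slate_size, doc_dim)

# Repeat each state once per document so rows line up with the flattened slate.
tiled_state = state.repeat_interleave(slate_size, dim=0)     # (12, 8)
flat_docs = slate.reshape(batch_size * slate_size, doc_dim)  # (12, 5)
q = torch.rand(batch_size * slate_size, 1)                   # stand-in for q_network output
q_per_doc = q.view(batch_size, slate_size)                   # (4, 3), one row per slate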
Example #6
    def score(state: rlt.FeatureData) -> torch.Tensor:
        tiled_state = state.repeat_interleave(repeats=num_candidates, axis=0)
        candidate_docs = state.candidate_docs
        assert candidate_docs is not None
        actions = candidate_docs.as_feature_data()

        q_network.eval()
        scores = q_network(tiled_state, actions).view(-1, num_candidates)
        q_network.train()

        select_prob = F.softmax(candidate_docs.value, dim=1)
        assert select_prob.shape == scores.shape

        return select_prob * scores
Example #7
    def test_fully_connected(self):
        chooser = ValueNetBuilder__Union(
            FullyConnected=value.fully_connected.FullyConnected()
        )
        builder = chooser.value
        state_dim = 3
        normalization_data = NormalizationData(
            dense_normalization_parameters={
                i: NormalizationParameters(feature_type=CONTINUOUS)
                for i in range(state_dim)
            }
        )
        value_network = builder.build_value_network(normalization_data)
        batch_size = 5
        x = FeatureData(float_features=torch.randn(batch_size, state_dim))
        y = value_network(x)
        self.assertEqual(y.shape, (batch_size, 1))
Example #8
    def setUp(self):
        # preparing various components for qr-dqn trainer initialization
        self.params = QRDQNTrainerParameters(actions=["1", "2"], num_atoms=11)
        self.reward_options = RewardOptions()
        self.metrics_to_score = get_metrics_to_score(
            self.reward_options.metric_reward_values
        )
        self.state_dim = 10
        self.action_dim = 2
        self.sizes = [20, 20]
        self.num_atoms = 11
        self.activations = ["relu", "relu"]
        self.dropout_ratio = 0
        self.q_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            num_atoms=self.num_atoms,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
        self.q_network_target = self.q_network.get_target_network()
        self.x = FeatureData(float_features=torch.rand(5, 10))
        self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
        self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
            # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `actions`.
            self.params.actions
        )
        self.reward_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe_target = self.q_network_cpe.get_target_network()
Example #9
    def act(self,
            obs: rlt.FeatureData,
            possible_actions_mask: Optional[np.ndarray] = None
            ) -> rlt.ActorOutput:
        """Act randomly regardless of the observation."""
        # pyre-fixme[35]: Target cannot be annotated.
        obs: torch.Tensor = obs.float_features
        assert obs.dim() >= 2, f"obs has shape {obs.shape} (dim < 2)"
        assert obs.shape[0] == 1, f"obs has shape {obs.shape} (0th dim != 1)"
        batch_size = obs.shape[0]
        scores = torch.ones((batch_size, self.num_actions))
        scores = apply_possible_actions_mask(scores,
                                             possible_actions_mask,
                                             invalid_score=0.0)

        # sample a random action
        m = torch.distributions.Categorical(scores)
        raw_action = m.sample()
        action = F.one_hot(raw_action, self.num_actions)
        log_prob = m.log_prob(raw_action).float()
        return rlt.ActorOutput(action=action, log_prob=log_prob)
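apply_possible_actions_mask is not shown in this example; the intended effect, assuming a 0/1 mask and invalid_score=0.0, is that invalid actions get zero weight and are therefore never sampled by the Categorical. A minimal sketch of that masking with plain torch:

import torch
import torch.nn.functional as F

num_actions = 4
scores = torch.ones(1, num_actions)
mask = torch.tensor([[1, 0, 1, 1]])   # 0 marks an invalid action
masked_scores = scores * mask         # same effect as invalid_score=0.0 above

m = torch.distributions.Categorical(masked_scores)
raw_action = m.sample()               # index 1 is never drawn: its weight is 0
action = F.one_hot(raw_action, num_actions)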
Example #10
    def create_from_tensors_dqn(
        cls,
        trainer: DQNTrainer,
        mdp_ids: torch.Tensor,
        sequence_numbers: torch.Tensor,
        states: rlt.FeatureData,
        actions: rlt.FeatureData,
        propensities: torch.Tensor,
        rewards: torch.Tensor,
        possible_actions_mask: torch.Tensor,
        metrics: Optional[torch.Tensor] = None,
    ):
        old_q_train_state = trainer.q_network.training
        # pyre-fixme[16]: `DQNTrainer` has no attribute `reward_network`.
        old_reward_train_state = trainer.reward_network.training
        # pyre-fixme[16]: `DQNTrainer` has no attribute `q_network_cpe`.
        old_q_cpe_train_state = trainer.q_network_cpe.training
        trainer.q_network.train(False)
        trainer.reward_network.train(False)
        trainer.q_network_cpe.train(False)

        num_actions = trainer.num_actions
        action_mask = actions.float()

        rewards = trainer.boost_rewards(rewards, actions)
        model_values = trainer.q_network_cpe(states)[:, 0:num_actions]
        optimal_q_values, _ = trainer.get_detached_q_values(states)
        # Do we ever really use eval_action_idxs?
        eval_action_idxs = trainer.get_max_q_values(optimal_q_values,
                                                    possible_actions_mask)[1]
        model_propensities = masked_softmax(optimal_q_values,
                                            possible_actions_mask,
                                            trainer.rl_temperature)
        assert model_values.shape == actions.shape, ("Invalid shape: " +
                                                     str(model_values.shape) +
                                                     " != " +
                                                     str(actions.shape))
        assert model_values.shape == possible_actions_mask.shape, (
            "Invalid shape: " + str(model_values.shape) + " != " +
            str(possible_actions_mask.shape))
        model_values_for_logged_action = torch.sum(model_values * action_mask,
                                                   dim=1,
                                                   keepdim=True)

        rewards_and_metric_rewards = trainer.reward_network(states)

        # In case we reuse the module for the Q-network
        if hasattr(rewards_and_metric_rewards, "q_values"):
            rewards_and_metric_rewards = rewards_and_metric_rewards.q_values

        model_rewards = rewards_and_metric_rewards[:, 0:num_actions]
        assert model_rewards.shape == actions.shape, (
            "Invalid shape: " + str(model_rewards.shape) + " != " +
            str(actions.shape))
        model_rewards_for_logged_action = torch.sum(model_rewards *
                                                    action_mask,
                                                    dim=1,
                                                    keepdim=True)

        model_metrics = rewards_and_metric_rewards[:, num_actions:]

        assert model_metrics.shape[1] % num_actions == 0, (
            "Invalid metrics shape: " + str(model_metrics.shape) + " " +
            str(num_actions))
        num_metrics = model_metrics.shape[1] // num_actions

        if num_metrics == 0:
            model_metrics_values = None
            model_metrics_for_logged_action = None
            model_metrics_values_for_logged_action = None
        else:
            model_metrics_values = trainer.q_network_cpe(states)
            # Backward compatibility
            if hasattr(model_metrics_values, "q_values"):
                model_metrics_values = model_metrics_values.q_values
            model_metrics_values = model_metrics_values[:, num_actions:]
            assert model_metrics_values.shape[
                1] == num_actions * num_metrics, (
                    "Invalid shape: " + str(model_metrics_values.shape[1]) +
                    " != " + str(actions.shape[1] * num_metrics))

            model_metrics_for_logged_action_list = []
            model_metrics_values_for_logged_action_list = []
            for metric_index in range(num_metrics):
                metric_start = metric_index * num_actions
                metric_end = (metric_index + 1) * num_actions
                model_metrics_for_logged_action_list.append(
                    torch.sum(
                        model_metrics[:, metric_start:metric_end] *
                        action_mask,
                        dim=1,
                        keepdim=True,
                    ))

                model_metrics_values_for_logged_action_list.append(
                    torch.sum(
                        model_metrics_values[:, metric_start:metric_end] *
                        action_mask,
                        dim=1,
                        keepdim=True,
                    ))
            model_metrics_for_logged_action = torch.cat(
                model_metrics_for_logged_action_list, dim=1)
            model_metrics_values_for_logged_action = torch.cat(
                model_metrics_values_for_logged_action_list, dim=1)

        trainer.q_network_cpe.train(old_q_cpe_train_state)
        trainer.q_network.train(old_q_train_state)
        trainer.reward_network.train(old_reward_train_state)

        return cls(
            mdp_id=mdp_ids,
            sequence_number=sequence_numbers,
            logged_propensities=propensities,
            logged_rewards=rewards,
            action_mask=action_mask,
            model_rewards=model_rewards,
            model_rewards_for_logged_action=model_rewards_for_logged_action,
            model_values=model_values,
            model_values_for_logged_action=model_values_for_logged_action,
            model_metrics_values=model_metrics_values,
            model_metrics_values_for_logged_action=
            model_metrics_values_for_logged_action,
            model_propensities=model_propensities,
            logged_metrics=metrics,
            model_metrics=model_metrics,
            model_metrics_for_logged_action=model_metrics_for_logged_action,
            # Will compute later
            logged_values=None,
            logged_metrics_values=None,
            possible_actions_mask=possible_actions_mask,
            optimal_q_values=optimal_q_values,
            eval_action_idxs=eval_action_idxs,
        )
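Example #10 uses masked_softmax to turn the detached Q-values into model propensities over allowed actions only. A minimal sketch of one common way to implement that behavior (an illustration under that reading, not necessarily the library's exact implementation):

import torch
import torch.nn.functional as F

def masked_softmax_sketch(q_values, possible_actions_mask, temperature):
    # Assign -inf logits to impossible actions so they get zero probability.
    logits = q_values / temperature
    logits = logits.masked_fill(possible_actions_mask == 0, float("-inf"))
    return F.softmax(logits, dim=1)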
Example #11
    def setUp(self):
        # preparing various components for CRR trainer initialization
        self.batch_size = 3
        self.state_dim = 10
        self.action_dim = 2
        self.num_layers = 2
        self.sizes = [20 for _ in range(self.num_layers)]
        self.num_atoms = 11
        self.activations = ["relu" for _ in range(self.num_layers)]
        self.dropout_ratio = 0
        self.exploration_variance = 1e-10

        self.actions = [str(i) for i in range(self.action_dim)]
        self.params = CRRTrainerParameters(actions=self.actions)
        self.reward_options = RewardOptions()
        self.metrics_to_score = get_metrics_to_score(
            self.reward_options.metric_reward_values
        )

        self.actor_network = FullyConnectedActor(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            activations=self.activations,
            exploration_variance=self.exploration_variance,
        )
        self.actor_network_target = self.actor_network.get_target_network()

        self.q1_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
        self.q1_network_target = self.q1_network.get_target_network()

        self.q2_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
        self.q2_network_target = self.q2_network.get_target_network()

        self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
            self.params.actions
        )
        self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
        self.reward_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe_target = self.q_network_cpe.get_target_network()
        self.inp = DiscreteDqnInput(
            state=FeatureData(
                float_features=torch.rand(self.batch_size, self.state_dim)
            ),
            next_state=FeatureData(
                float_features=torch.rand(self.batch_size, self.state_dim)
            ),
            reward=torch.ones(self.batch_size, 1),
            time_diff=torch.ones(self.batch_size, 1) * 2,
            step=torch.ones(self.batch_size, 1) * 2,
            not_terminal=torch.ones(
                self.batch_size, 1
            ),  # todo: check terminal behavior
            action=torch.tensor([[0, 1], [1, 0], [0, 1]]),
            next_action=torch.tensor([[1, 0], [0, 1], [1, 0]]),
            possible_actions_mask=torch.ones(self.batch_size, self.action_dim),
            possible_next_actions_mask=torch.ones(self.batch_size, self.action_dim),
            extras=ExtraData(action_probability=torch.ones(self.batch_size, 1)),
        )
Example #12
    def _concat_features(self, obs: rlt.FeatureData):
        if self.has_user_feat:
            return obs.concat_user_doc()
        else:
            # pyre-fixme[16]: `Optional` has no attribute `float_features`.
            return obs.candidate_docs.float_features.float()
Example #13
    def evaluate(self, batch: MemoryNetworkInput):
        """Calculate feature importance: setting each state/action feature to
        the mean value and observe loss increase."""

        self.trainer.memory_network.mdnrnn.eval()
        state_features = batch.state.float_features
        action_features = batch.action
        seq_len, batch_size, state_dim = state_features.size()
        action_dim = action_features.size()[2]
        action_feature_num = self.action_feature_num
        state_feature_num = self.state_feature_num
        feature_importance = torch.zeros(action_feature_num + state_feature_num)

        orig_losses = self.trainer.get_loss(batch, state_dim=state_dim)
        orig_loss = orig_losses["loss"].cpu().detach().item()
        del orig_losses

        action_feature_boundaries = self.sorted_action_feature_start_indices + [
            action_dim
        ]
        state_feature_boundaries = self.sorted_state_feature_start_indices + [state_dim]

        for i in range(action_feature_num):
            action_features = batch.action.reshape(
                (batch_size * seq_len, action_dim)
            ).data.clone()

            # if actions are discrete, an action's feature importance is the loss
            # increase due to setting all actions to this action
            if self.discrete_action:
                assert action_dim == action_feature_num
                action_vec = torch.zeros(action_dim)
                action_vec[i] = 1
                action_features[:] = action_vec
            # if actions are continuous, an action's feature importance is the loss
            # increase due to masking this action feature to its mean value
            else:
                boundary_start, boundary_end = (
                    action_feature_boundaries[i],
                    action_feature_boundaries[i + 1],
                )
                action_features[
                    :, boundary_start:boundary_end
                ] = self.compute_median_feature_value(
                    action_features[:, boundary_start:boundary_end]
                )

            action_features = action_features.reshape((seq_len, batch_size, action_dim))
            new_batch = MemoryNetworkInput(
                state=batch.state,
                action=action_features,
                next_state=batch.next_state,
                reward=batch.reward,
                time_diff=torch.ones_like(batch.reward).float(),
                not_terminal=batch.not_terminal,
                step=None,
            )
            losses = self.trainer.get_loss(new_batch, state_dim=state_dim)
            feature_importance[i] = losses["loss"].cpu().detach().item() - orig_loss
            del losses

        for i in range(state_feature_num):
            state_features = batch.state.float_features.reshape(
                (batch_size * seq_len, state_dim)
            ).data.clone()
            boundary_start, boundary_end = (
                state_feature_boundaries[i],
                state_feature_boundaries[i + 1],
            )
            state_features[
                :, boundary_start:boundary_end
            ] = self.compute_median_feature_value(
                state_features[:, boundary_start:boundary_end]
            )
            state_features = state_features.reshape((seq_len, batch_size, state_dim))
            new_batch = MemoryNetworkInput(
                state=FeatureData(float_features=state_features),
                action=batch.action,
                next_state=batch.next_state,
                reward=batch.reward,
                time_diff=torch.ones_like(batch.reward).float(),
                not_terminal=batch.not_terminal,
                step=None,
            )
            losses = self.trainer.get_loss(new_batch, state_dim=state_dim)
            feature_importance[i + action_feature_num] = (
                losses["loss"].cpu().detach().item() - orig_loss
            )
            del losses

        self.trainer.memory_network.mdnrnn.train()
        logger.info(
            "**** Debug tool feature importance ****: {}".format(feature_importance)
        )
        return {"feature_loss_increase": feature_importance.numpy()}
Example #14
    def forward(self, obs: rlt.FeatureData):
        mlp_input = obs.get_ranking_state(self.has_user_feat)
        scores = self.mlp(mlp_input)
        return scores.squeeze(-1)
Example #15
    def test_train_step_gen(self):
        inp = DiscreteDqnInput(
            state=FeatureData(float_features=torch.rand(3, 10)),
            next_state=FeatureData(float_features=torch.rand(3, 10)),
            reward=torch.ones(3, 1),
            time_diff=torch.ones(3, 1) * 2,
            step=torch.ones(3, 1) * 2,
            not_terminal=torch.ones(3, 1),  # todo: check terminal behavior
            action=torch.tensor([[0, 1], [1, 0], [0, 1]]),
            next_action=torch.tensor([[1, 0], [0, 1], [1, 0]]),
            possible_actions_mask=torch.ones(3, 2),
            possible_next_actions_mask=torch.ones(3, 2),
            extras=ExtraData(),
        )
        mse_backward_type = type(
            torch.nn.functional.mse_loss(
                torch.tensor([1.0], requires_grad=True), torch.zeros(1)
            ).grad_fn
        )
        add_backward_type = type(
            (
                torch.tensor([1.0], requires_grad=True)
                + torch.tensor([1.0], requires_grad=True)
            ).grad_fn
        )
        mean_backward_type = type(
            torch.tensor([1.0, 2.0], requires_grad=True).mean().grad_fn
        )

        # vanilla
        trainer = self._construct_trainer()
        loss_gen = trainer.train_step_gen(inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 4)
        self.assertEqual(type(losses[0].grad_fn), mean_backward_type)
        self.assertEqual(type(losses[1].grad_fn), mse_backward_type)
        self.assertEqual(type(losses[2].grad_fn), mse_backward_type)
        self.assertEqual(type(losses[3].grad_fn), add_backward_type)

        # no CPE
        trainer = self._construct_trainer(no_cpe=True)
        loss_gen = trainer.train_step_gen(inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 2)

        # seq_num
        param_copy = QRDQNTrainerParameters(
            actions=["1", "2"],
            num_atoms=11,
            rl=RLParameters(use_seq_num_diff_as_time_diff=True),
        )
        trainer = self._construct_trainer(new_params=param_copy)
        loss_gen = trainer.train_step_gen(inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 4)

        # multi_steps
        param_copy = QRDQNTrainerParameters(
            actions=["1", "2"], num_atoms=11, rl=RLParameters(multi_steps=2)
        )
        trainer = self._construct_trainer(new_params=param_copy)
        loss_gen = trainer.train_step_gen(inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 4)

        # non_max_q
        param_copy = QRDQNTrainerParameters(
            actions=["1", "2"], num_atoms=11, rl=RLParameters(maxq_learning=False)
        )
        trainer = self._construct_trainer(new_params=param_copy)
        loss_gen = trainer.train_step_gen(inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 4)
Example #16
def extract_state_first_step(batch):
    return FeatureData(batch.state.float_features[0])
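extract_state_first_step assumes a sequence-first layout, i.e. float_features shaped (seq_len, batch_size, state_dim), so indexing with [0] keeps only the first time step. A quick shape check under that assumption:

import torch

seq_len, batch_size, state_dim = 6, 4, 10
seq_features = torch.rand(seq_len, batch_size, state_dim)
first_step = seq_features[0]
assert first_step.shape == (batch_size, state_dim)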