Example #1
    def build_trainer(
        self,
        normalization_data_map: Dict[str, NormalizationData],
        use_gpu: bool,
        reward_options: Optional[RewardOptions] = None,
    ) -> ParametricDQNTrainer:
        net_builder = self.net_builder.value
        # pyre-fixme[16]: `ParametricDQN` has no attribute `_q_network`.
        self._q_network = net_builder.build_q_network(
            normalization_data_map[NormalizationKey.STATE],
            normalization_data_map[NormalizationKey.ACTION],
        )
        # Metrics + reward
        reward_options = reward_options or RewardOptions()
        metrics_to_score = get_metrics_to_score(
            reward_options.metric_reward_values)
        reward_output_dim = len(metrics_to_score) + 1
        reward_network = net_builder.build_q_network(
            normalization_data_map[NormalizationKey.STATE],
            normalization_data_map[NormalizationKey.ACTION],
            output_dim=reward_output_dim,
        )

        q_network_target = self._q_network.get_target_network()
        return ParametricDQNTrainer(
            q_network=self._q_network,
            q_network_target=q_network_target,
            reward_network=reward_network,
            # pyre-fixme[16]: `ParametricDQNTrainerParameters` has no attribute
            #  `asdict`.
            **self.trainer_param.asdict(),
        )
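
For orientation, here is a minimal pure-PyTorch sketch of what build_trainer wires together: a Q-network with a single output, a reward network with one output per tracked metric plus one for the reward itself, and a frozen copy of the Q-network used as the target. The critic module below is an illustrative stand-in, not ReAgent's net builder.

import copy

import torch.nn as nn

state_dim, action_dim = 8, 4                   # illustrative feature sizes
metrics_to_score = ["metric_a", "metric_b"]    # hypothetical metric names
reward_output_dim = len(metrics_to_score) + 1  # +1 for the reward itself

def make_critic(output_dim: int) -> nn.Module:
    # Stand-in for net_builder.build_q_network: maps concatenated
    # (state, action) features to `output_dim` values.
    return nn.Sequential(
        nn.Linear(state_dim + action_dim, 64),
        nn.ReLU(),
        nn.Linear(64, output_dim),
    )

q_network = make_critic(output_dim=1)
reward_network = make_critic(output_dim=reward_output_dim)

# Analogue of q_network.get_target_network(): a detached copy whose weights
# are only refreshed by periodic syncs from the online network.
q_network_target = copy.deepcopy(q_network)
for p in q_network_target.parameters():
    p.requires_grad_(False)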
Example #2
    def build_trainer(self) -> ParametricDQNTrainer:
        net_builder = self.net_builder.value
        # pyre-fixme[16]: `ParametricDQN` has no attribute `_q_network`.
        self._q_network = net_builder.build_q_network(
            self.state_normalization_data, self.action_normalization_data)
        # Metrics + reward
        reward_output_dim = len(self.metrics_to_score) + 1
        reward_network = net_builder.build_q_network(
            self.state_normalization_data,
            self.action_normalization_data,
            output_dim=reward_output_dim,
        )

        if self.use_gpu:
            self._q_network = self._q_network.cuda()
            reward_network = reward_network.cuda()

        q_network_target = self._q_network.get_target_network()
        # pyre-fixme[29]: `Type[ParametricDQNTrainer]` is not a function.
        return ParametricDQNTrainer(
            q_network=self._q_network,
            q_network_target=q_network_target,
            reward_network=reward_network,
            use_gpu=self.use_gpu,
            # pyre-fixme[16]: `ParametricDQNTrainerParameters` has no attribute
            #  `asdict`.
            **self.trainer_param.asdict(),
        )
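
As context for why both examples keep a separate q_network_target: DQN-style trainers typically refresh the target with a hard copy or a soft (Polyak) update from the online network. A generic sketch of the soft variant, not ReAgent's trainer code:

import copy

import torch
import torch.nn as nn

q_network = nn.Linear(12, 1)            # illustrative online network
q_network_target = copy.deepcopy(q_network)
tau = 0.005                             # illustrative soft-update rate

@torch.no_grad()
def soft_update(online: nn.Module, target: nn.Module, tau: float) -> None:
    # target <- tau * online + (1 - tau) * target, parameter by parameter
    for p_online, p_target in zip(online.parameters(), target.parameters()):
        p_target.mul_(1.0 - tau).add_(tau * p_online)

soft_update(q_network, q_network_target, tau)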
Example #3
def create_parametric_dqn_trainer_from_params(
    model: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
):
    q_network = FullyConnectedCritic(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    reward_network = FullyConnectedCritic(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    q_network_target = q_network.get_target_network()

    if use_gpu:
        q_network = q_network.cuda()
        q_network_target = q_network_target.cuda()
        reward_network = reward_network.cuda()

    if use_all_avail_gpus:
        q_network = q_network.get_distributed_data_parallel_model()
        q_network_target = q_network_target.get_distributed_data_parallel_model()
        reward_network = reward_network.get_distributed_data_parallel_model()

    # pyre-fixme[28]: Unexpected keyword argument `rl`.
    trainer_parameters = ParametricDQNTrainerParameters(
        rl=model.rl,
        double_q_learning=model.rainbow.double_q_learning,
        minibatch_size=model.training.minibatch_size,
        optimizer=OptimizerParameters(
            optimizer=model.training.optimizer,
            learning_rate=model.training.learning_rate,
            l2_decay=model.training.l2_decay,
        ),
    )

    return ParametricDQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        use_gpu=use_gpu,
        # pyre-fixme[16]: `ParametricDQNTrainerParameters` has no attribute `asdict`.
        **trainer_parameters.asdict())
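
get_distributed_data_parallel_model() presumably wraps the network for (Distributed)DataParallel execution; the ordering in this example mirrors the usual PyTorch pattern of moving parameters to the device first and wrapping second. A generic sketch of that pattern, not the library call itself:

import torch
import torch.nn as nn

net = nn.Linear(12, 1)

if torch.cuda.is_available():
    # Move parameters to the GPU first ...
    net = net.cuda()
    # ... then wrap; DataParallel replicates whatever module it is handed.
    if torch.cuda.device_count() > 1:
        net = nn.DataParallel(net)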
Example #4
    def create_from_tensors_parametric_dqn(
        cls,
        trainer: ParametricDQNTrainer,
        mdp_ids: torch.Tensor,
        sequence_numbers: torch.Tensor,
        states: rlt.FeatureData,
        actions: rlt.FeatureData,
        propensities: torch.Tensor,
        rewards: torch.Tensor,
        possible_actions_mask: torch.Tensor,
        possible_actions: rlt.FeatureData,
        max_num_actions: int,
        metrics: Optional[torch.Tensor] = None,
    ):
        old_q_train_state = trainer.q_network.training
        old_reward_train_state = trainer.reward_network.training
        trainer.q_network.train(False)
        trainer.reward_network.train(False)

        tiled_state = states.float_features.repeat(1, max_num_actions).reshape(
            -1, states.float_features.shape[1])
        assert possible_actions is not None
        # Pair each tiled state with its candidate action so every possible
        # action can be scored for each state.
        possible_actions_state_concat = (
            rlt.FeatureData(tiled_state),
            possible_actions,
        )

        # FIXME: model_values, model_values_for_logged_action, and model_metrics_values
        # should be calculated using q_network_cpe (as in discrete dqn).
        # q_network_cpe has not been added in parametric dqn yet.
        model_values = trainer.q_network(*possible_actions_state_concat)
        optimal_q_values, _ = trainer.get_detached_q_values(
            *possible_actions_state_concat)
        eval_action_idxs = None

        assert (
            model_values.shape[1] == 1
            and model_values.shape[0]
            == possible_actions_mask.shape[0] * possible_actions_mask.shape[1]
        ), f"Invalid shapes: {model_values.shape} != {possible_actions_mask.shape}"
        model_values = model_values.reshape(possible_actions_mask.shape)
        optimal_q_values = optimal_q_values.reshape(possible_actions_mask.shape)
        model_propensities = masked_softmax(
            optimal_q_values, possible_actions_mask, trainer.rl_temperature
        )

        rewards_and_metric_rewards = trainer.reward_network(
            *possible_actions_state_concat)
        model_rewards = rewards_and_metric_rewards[:, :1]
        assert (
            model_rewards.shape[0] * model_rewards.shape[1]
            == possible_actions_mask.shape[0] * possible_actions_mask.shape[1]
        ), f"Invalid shapes: {model_rewards.shape} != {possible_actions_mask.shape}"
        model_rewards = model_rewards.reshape(possible_actions_mask.shape)

        model_metrics = rewards_and_metric_rewards[:, 1:]
        model_metrics = model_metrics.reshape(possible_actions_mask.shape[0], -1)

        model_values_for_logged_action = trainer.q_network(states, actions)
        model_rewards_and_metrics_for_logged_action = trainer.reward_network(
            states, actions)
        model_rewards_for_logged_action = (
            model_rewards_and_metrics_for_logged_action[:, :1]
        )

        action_dim = possible_actions.float_features.shape[1]
        action_mask = torch.all(
            possible_actions.float_features.view(-1, max_num_actions, action_dim)
            == actions.float_features.unsqueeze(dim=1),
            dim=2,
        ).float()
        assert torch.all(action_mask.sum(dim=1) == 1)
        num_metrics = model_metrics.shape[1] // max_num_actions

        model_metrics_values = None
        model_metrics_for_logged_action = None
        model_metrics_values_for_logged_action = None
        if num_metrics > 0:
            # FIXME: calculate model_metrics_values when q_network_cpe is added
            # to parametric dqn
            model_metrics_values = model_values.repeat(1, num_metrics)

        trainer.q_network.train(old_q_train_state)
        trainer.reward_network.train(old_reward_train_state)

        return cls(
            mdp_id=mdp_ids,
            sequence_number=sequence_numbers,
            logged_propensities=propensities,
            logged_rewards=rewards,
            action_mask=action_mask,
            model_rewards=model_rewards,
            model_rewards_for_logged_action=model_rewards_for_logged_action,
            model_values=model_values,
            model_values_for_logged_action=model_values_for_logged_action,
            model_metrics_values=model_metrics_values,
            model_metrics_values_for_logged_action=model_metrics_values_for_logged_action,
            model_propensities=model_propensities,
            logged_metrics=metrics,
            model_metrics=model_metrics,
            model_metrics_for_logged_action=model_metrics_for_logged_action,
            # Will compute later
            logged_values=None,
            logged_metrics_values=None,
            possible_actions_mask=possible_actions_mask,
            optimal_q_values=optimal_q_values,
            eval_action_idxs=eval_action_idxs,
        )
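
The tiling, action-mask, and propensity logic above is easy to check in isolation. Below is a self-contained sketch with plain tensors; masked_softmax here is a plausible stand-in consistent with how it is used above, not necessarily ReAgent's exact implementation.

import torch

def masked_softmax(q_values, mask, temperature):
    # Stand-in: suppress invalid actions, then softmax per row.
    scores = q_values / temperature
    scores = scores.masked_fill(mask == 0, float("-inf"))
    return torch.softmax(scores, dim=1)

batch_size, max_num_actions, state_dim, action_dim = 2, 3, 5, 4

states = torch.randn(batch_size, state_dim)
possible_actions = torch.randn(batch_size * max_num_actions, action_dim)
# Pretend the first candidate of each row was the logged action.
actions = possible_actions.view(batch_size, max_num_actions, action_dim)[:, 0, :]

# Tile each state once per candidate action, matching the flat candidate layout.
tiled_state = states.repeat(1, max_num_actions).reshape(-1, state_dim)
assert tiled_state.shape == (batch_size * max_num_actions, state_dim)

# Flat per-candidate Q-values reshape back to (batch, max_num_actions).
q_values = torch.randn(batch_size * max_num_actions, 1)
q_values = q_values.reshape(batch_size, max_num_actions)

# Row 2 has only two valid candidates; its third propensity comes out as zero.
possible_actions_mask = torch.tensor([[1.0, 1.0, 1.0], [1.0, 1.0, 0.0]])
model_propensities = masked_softmax(q_values, possible_actions_mask, temperature=1.0)

# One-hot mask marking which candidate equals the logged action.
action_mask = torch.all(
    possible_actions.view(batch_size, max_num_actions, action_dim)
    == actions.unsqueeze(dim=1),
    dim=2,
).float()
assert torch.all(action_mask.sum(dim=1) == 1)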