Example no. 1
    def build_trainer(self) -> DQNTrainer:
        net_builder = self.net_builder.value
        q_network = net_builder.build_q_network(
            self.state_feature_config,
            self.state_normalization_parameters,
            # pyre-fixme[16]: `DiscreteDQN` has no attribute `action_names`.
            len(self.action_names),
        )

        if self.use_gpu:
            q_network = q_network.cuda()

        q_network_target = q_network.get_target_network()

        reward_network, q_network_cpe, q_network_cpe_target = None, None, None
        if self.trainer_param.evaluation.calc_cpe_in_training:
            # Metrics + reward
            num_output_nodes = (len(self.metrics_to_score) + 1) * len(
                self.trainer_param.actions)

            cpe_net_builder = self.cpe_net_builder.value
            reward_network = cpe_net_builder.build_q_network(
                self.state_feature_config,
                self.state_normalization_parameters,
                num_output_nodes,
            )
            q_network_cpe = cpe_net_builder.build_q_network(
                self.state_feature_config,
                self.state_normalization_parameters,
                num_output_nodes,
            )

            if self.use_gpu:
                reward_network.cuda()
                q_network_cpe.cuda()

            q_network_cpe_target = q_network_cpe.get_target_network()

        # pyre-fixme[16]: `DiscreteDQN` has no attribute `_q_network`.
        self._q_network = q_network
        trainer = DQNTrainer(
            q_network,
            q_network_target,
            reward_network,
            self.trainer_param,
            self.use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
            metrics_to_score=self.metrics_to_score,
            loss_reporter=NoOpLossReporter(),
        )
        return trainer
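
For reference, the CPE sizing above allocates one reward column plus one column per metric, for every action. A minimal sketch of that arithmetic with made-up metric and action names (not taken from the example):

# Hypothetical counts, illustrating the num_output_nodes layout.
metrics_to_score = ["metric_a", "metric_b", "metric_c"]  # three auxiliary metrics
actions = ["up", "down", "left", "right"]                # four discrete actions

# Metrics + reward: one block of columns per head, one column per action.
num_output_nodes = (len(metrics_to_score) + 1) * len(actions)
assert num_output_nodes == 16
# Columns [0:4]  -> reward head (one value per action)
# Columns [4:16] -> metric heads, four columns per metric
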
Example no. 2
    @classmethod
    def create_from_tensors_dqn(
        cls,
        trainer: DQNTrainer,
        mdp_ids: torch.Tensor,
        sequence_numbers: torch.Tensor,
        states: rlt.FeatureData,
        actions: rlt.FeatureData,
        propensities: torch.Tensor,
        rewards: torch.Tensor,
        possible_actions_mask: torch.Tensor,
        metrics: Optional[torch.Tensor] = None,
    ):
        old_q_train_state = trainer.q_network.training
        # pyre-fixme[16]: `DQNTrainer` has no attribute `reward_network`.
        old_reward_train_state = trainer.reward_network.training
        # pyre-fixme[16]: `DQNTrainer` has no attribute `q_network_cpe`.
        old_q_cpe_train_state = trainer.q_network_cpe.training
        trainer.q_network.train(False)
        trainer.reward_network.train(False)
        trainer.q_network_cpe.train(False)

        num_actions = trainer.num_actions
        action_mask = actions.float()

        rewards = trainer.boost_rewards(rewards, actions)
        model_values = trainer.q_network_cpe(states)[:, 0:num_actions]
        optimal_q_values, _ = trainer.get_detached_q_values(states)
        # Do we ever really use eval_action_idxs?
        eval_action_idxs = trainer.get_max_q_values(optimal_q_values,
                                                    possible_actions_mask)[1]
        model_propensities = masked_softmax(optimal_q_values,
                                            possible_actions_mask,
                                            trainer.rl_temperature)
        assert model_values.shape == actions.shape, ("Invalid shape: " +
                                                     str(model_values.shape) +
                                                     " != " +
                                                     str(actions.shape))
        assert model_values.shape == possible_actions_mask.shape, (
            "Invalid shape: " + str(model_values.shape) + " != " +
            str(possible_actions_mask.shape))
        model_values_for_logged_action = torch.sum(model_values * action_mask,
                                                   dim=1,
                                                   keepdim=True)

        rewards_and_metric_rewards = trainer.reward_network(states)

        # In case the reward network reuses the modular Q-network output type,
        # unwrap the underlying q_values tensor.
        if hasattr(rewards_and_metric_rewards, "q_values"):
            rewards_and_metric_rewards = rewards_and_metric_rewards.q_values

        model_rewards = rewards_and_metric_rewards[:, 0:num_actions]
        assert model_rewards.shape == actions.shape, (
            "Invalid shape: " + str(model_rewards.shape) + " != " +
            str(actions.shape))
        model_rewards_for_logged_action = torch.sum(model_rewards *
                                                    action_mask,
                                                    dim=1,
                                                    keepdim=True)

        model_metrics = rewards_and_metric_rewards[:, num_actions:]

        assert model_metrics.shape[1] % num_actions == 0, (
            "Invalid metrics shape: " + str(model_metrics.shape) + " " +
            str(num_actions))
        num_metrics = model_metrics.shape[1] // num_actions

        if num_metrics == 0:
            model_metrics_values = None
            model_metrics_for_logged_action = None
            model_metrics_values_for_logged_action = None
        else:
            model_metrics_values = trainer.q_network_cpe(states)
            # Backward compatibility: unwrap q_values if the CPE network
            # returns a wrapper object rather than a raw tensor.
            if hasattr(model_metrics_values, "q_values"):
                model_metrics_values = model_metrics_values.q_values
            model_metrics_values = model_metrics_values[:, num_actions:]
            assert model_metrics_values.shape[1] == num_actions * num_metrics, (
                "Invalid shape: " + str(model_metrics_values.shape[1]) +
                " != " + str(actions.shape[1] * num_metrics))

            model_metrics_for_logged_action_list = []
            model_metrics_values_for_logged_action_list = []
            for metric_index in range(num_metrics):
                metric_start = metric_index * num_actions
                metric_end = (metric_index + 1) * num_actions
                model_metrics_for_logged_action_list.append(
                    torch.sum(
                        model_metrics[:, metric_start:metric_end] *
                        action_mask,
                        dim=1,
                        keepdim=True,
                    ))

                model_metrics_values_for_logged_action_list.append(
                    torch.sum(
                        model_metrics_values[:, metric_start:metric_end] *
                        action_mask,
                        dim=1,
                        keepdim=True,
                    ))
            model_metrics_for_logged_action = torch.cat(
                model_metrics_for_logged_action_list, dim=1)
            model_metrics_values_for_logged_action = torch.cat(
                model_metrics_values_for_logged_action_list, dim=1)

        trainer.q_network_cpe.train(old_q_cpe_train_state)
        trainer.q_network.train(old_q_train_state)
        trainer.reward_network.train(old_reward_train_state)

        return cls(
            mdp_id=mdp_ids,
            sequence_number=sequence_numbers,
            logged_propensities=propensities,
            logged_rewards=rewards,
            action_mask=action_mask,
            model_rewards=model_rewards,
            model_rewards_for_logged_action=model_rewards_for_logged_action,
            model_values=model_values,
            model_values_for_logged_action=model_values_for_logged_action,
            model_metrics_values=model_metrics_values,
            model_metrics_values_for_logged_action=
            model_metrics_values_for_logged_action,
            model_propensities=model_propensities,
            logged_metrics=metrics,
            model_metrics=model_metrics,
            model_metrics_for_logged_action=model_metrics_for_logged_action,
            # Will compute later
            logged_values=None,
            logged_metrics_values=None,
            possible_actions_mask=possible_actions_mask,
            optimal_q_values=optimal_q_values,
            eval_action_idxs=eval_action_idxs,
        )
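
The logged-action reductions above depend on `action_mask` being a one-hot encoding of the logged action, so an elementwise product followed by a row sum selects that action's column, and metric i occupies columns `[i * num_actions, (i + 1) * num_actions)`. A self-contained sketch of the same pattern with hypothetical tensors (not ReAgent code):

import torch

num_actions, num_metrics = 4, 2
model_values = torch.arange(8.0).reshape(2, num_actions)  # batch of 2, one value per action
action_mask = torch.tensor([[0.0, 1.0, 0.0, 0.0],          # logged action 1
                            [0.0, 0.0, 0.0, 1.0]])         # logged action 3

# Same reduction as model_values_for_logged_action in the example.
values_for_logged = torch.sum(model_values * action_mask, dim=1, keepdim=True)
assert values_for_logged.tolist() == [[1.0], [7.0]]

# Per-metric slicing: metric i occupies columns [i*num_actions, (i+1)*num_actions).
model_metrics = torch.randn(2, num_metrics * num_actions)
for metric_index in range(num_metrics):
    start, end = metric_index * num_actions, (metric_index + 1) * num_actions
    metric_for_logged = torch.sum(model_metrics[:, start:end] * action_mask,
                                  dim=1,
                                  keepdim=True)
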
Example no. 3
    def get_modular_sarsa_trainer_reward_boost(
        self,
        environment,
        reward_shape,
        dueling,
        categorical,
        quantile,
        use_gpu=False,
        use_all_avail_gpus=False,
        clip_grad_norm=None,
    ):
        assert not quantile or not categorical
        parameters = self.get_sarsa_parameters(
            environment, reward_shape, dueling, categorical, quantile, clip_grad_norm
        )

        def make_dueling_dqn(num_atoms=None):
            return models.DuelingQNetwork.make_fully_connected(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                layers=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
                num_atoms=num_atoms,
            )

        if quantile:
            if dueling:
                q_network = make_dueling_dqn(num_atoms=parameters.rainbow.num_atoms)
            else:
                q_network = models.FullyConnectedDQN(
                    state_dim=get_num_output_features(environment.normalization),
                    action_dim=len(environment.ACTIONS),
                    num_atoms=parameters.rainbow.num_atoms,
                    sizes=parameters.training.layers[1:-1],
                    activations=parameters.training.activations[:-1],
                )
        elif categorical:
            assert not dueling
            distributional_network = models.FullyConnectedDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                num_atoms=parameters.rainbow.num_atoms,
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )
            q_network = models.CategoricalDQN(
                distributional_network,
                qmin=-100,
                qmax=200,
                num_atoms=parameters.rainbow.num_atoms,
            )
        else:
            if dueling:
                q_network = make_dueling_dqn()
            else:
                q_network = models.FullyConnectedDQN(
                    state_dim=get_num_output_features(environment.normalization),
                    action_dim=len(environment.ACTIONS),
                    sizes=parameters.training.layers[1:-1],
                    activations=parameters.training.activations[:-1],
                )

        q_network_cpe, q_network_cpe_target, reward_network = None, None, None

        if parameters.evaluation and parameters.evaluation.calc_cpe_in_training:
            q_network_cpe = models.FullyConnectedDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )
            q_network_cpe_target = q_network_cpe.get_target_network()
            reward_network = models.FullyConnectedDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )

        if use_gpu:
            q_network = q_network.cuda()
            if parameters.evaluation and parameters.evaluation.calc_cpe_in_training:
                reward_network = reward_network.cuda()
                q_network_cpe = q_network_cpe.cuda()
                q_network_cpe_target = q_network_cpe_target.cuda()
            if use_all_avail_gpus and not categorical:
                q_network = q_network.get_distributed_data_parallel_model()
                reward_network = reward_network.get_distributed_data_parallel_model()
                q_network_cpe = q_network_cpe.get_distributed_data_parallel_model()
                q_network_cpe_target = (
                    q_network_cpe_target.get_distributed_data_parallel_model()
                )

        if quantile:
            parameters = QRDQNTrainerParameters.from_discrete_action_model_parameters(
                parameters
            )
            trainer = QRDQNTrainer(
                q_network,
                q_network.get_target_network(),
                parameters,
                use_gpu,
                reward_network=reward_network,
                q_network_cpe=q_network_cpe,
                q_network_cpe_target=q_network_cpe_target,
            )
        elif categorical:
            parameters = C51TrainerParameters.from_discrete_action_model_parameters(
                parameters
            )
            trainer = C51Trainer(
                q_network, q_network.get_target_network(), parameters, use_gpu
            )
        else:
            parameters = DQNTrainerParameters.from_discrete_action_model_parameters(
                parameters
            )
            trainer = DQNTrainer(
                q_network,
                q_network.get_target_network(),
                reward_network,
                parameters,
                use_gpu,
                q_network_cpe=q_network_cpe,
                q_network_cpe_target=q_network_cpe_target,
            )
        return trainer
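
To summarize the flag handling in this test helper, the (dueling, categorical, quantile) combination selects the trainer class as sketched below; this dispatch function is illustrative only and not part of the original helper:

def trainer_kind(dueling: bool, categorical: bool, quantile: bool) -> str:
    """Mirror of the branch structure above; quantile and categorical are mutually exclusive."""
    assert not (quantile and categorical)
    if quantile:
        return "QRDQNTrainer"  # dueling or fully connected Q-network with num_atoms
    if categorical:
        assert not dueling
        return "C51Trainer"    # CategoricalDQN wrapping a distributional network
    return "DQNTrainer"        # dueling or plain FullyConnectedDQN

assert trainer_kind(dueling=True, categorical=False, quantile=False) == "DQNTrainer"
assert trainer_kind(dueling=False, categorical=False, quantile=True) == "QRDQNTrainer"
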
Example no. 4
def create_dqn_trainer_from_params(
    model: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
    metrics_to_score=None,
):
    metrics_to_score = metrics_to_score or []

    if model.rainbow.quantile:
        q_network = QuantileDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
    elif model.rainbow.categorical:
        q_network = CategoricalDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            qmin=model.rainbow.qmin,
            qmax=model.rainbow.qmax,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
            use_gpu=use_gpu,
        )
    elif model.rainbow.dueling_architecture:
        q_network = DuelingQNetwork(  # type: ignore
            layers=[get_num_output_features(normalization_parameters)] +
            model.training.layers[1:-1] + [len(model.actions)],
            activations=model.training.activations,
        )
    else:
        q_network = FullyConnectedDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if model.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(model.actions)
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

        if use_gpu and torch.cuda.is_available():
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    if (use_all_avail_gpus and not model.rainbow.categorical
            and not model.rainbow.quantile):
        q_network = q_network.get_distributed_data_parallel_model()
        reward_network = (reward_network.get_distributed_data_parallel_model()
                          if reward_network else None)
        q_network_cpe = (q_network_cpe.get_distributed_data_parallel_model()
                         if q_network_cpe else None)

    if model.rainbow.quantile:
        assert (not use_all_avail_gpus
                ), "use_all_avail_gpus not implemented for distributional RL"
        parameters = QRDQNTrainerParameters.from_discrete_action_model_parameters(
            model)
        return QRDQNTrainer(
            q_network,
            q_network_target,
            parameters,
            use_gpu,
            metrics_to_score=metrics_to_score,
            reward_network=reward_network,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
        )

    elif model.rainbow.categorical:
        assert (not use_all_avail_gpus
                ), "use_all_avail_gpus not implemented for distributional RL"
        return C51Trainer(
            q_network,
            q_network_target,
            C51TrainerParameters.from_discrete_action_model_parameters(model),
            use_gpu,
            metrics_to_score=metrics_to_score,
        )

    else:
        parameters = DQNTrainerParameters.from_discrete_action_model_parameters(
            model)
        return DQNTrainer(
            q_network,
            q_network_target,
            reward_network,
            parameters,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
            metrics_to_score=metrics_to_score,
        )
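
Across these builders, `model.training.layers` appears to list the full layer widths including input and output placeholders, and `model.training.activations` includes the output activation, which is why the hidden-layer slices are `layers[1:-1]` and `activations[:-1]`. A quick illustration with hypothetical values:

# Hypothetical DiscreteActionModelParameters-style training config.
layers = [-1, 256, 128, -1]               # input/output sizes are filled in from state/action dims
activations = ["relu", "relu", "linear"]  # one activation per weight layer, the last for the output

sizes = layers[1:-1]                      # hidden layer widths passed as `sizes=`
hidden_activations = activations[:-1]     # hidden activations passed as `activations=`
assert sizes == [256, 128]
assert hidden_activations == ["relu", "relu"]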