Example #1
    def configure_optimizers(self):
        optimizers = []
        optimizers.append(
            self.q_network_optimizer.make_optimizer_scheduler(
                self.q_network.parameters()))
        if self.calc_cpe_in_training:
            optimizers.append(
                self.reward_network_optimizer.make_optimizer_scheduler(
                    self.reward_network.parameters()))
            optimizers.append(
                self.q_network_cpe_optimizer.make_optimizer_scheduler(
                    self.q_network_cpe.parameters()))

        # soft-update
        target_params = list(self.q_network_target.parameters())
        source_params = list(self.q_network.parameters())
        if self.calc_cpe_in_training:
            target_params += list(self.q_network_cpe_target.parameters())
            source_params += list(self.q_network_cpe.parameters())
        optimizers.append(
            SoftUpdate.make_optimizer_scheduler(target_params,
                                                source_params,
                                                tau=self.tau))

        return optimizers
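The SoftUpdate entry appended last does not train anything; it blends the target network toward the online network a little each step. A minimal sketch of that Polyak-style update in plain PyTorch (an illustration, not ReAgent's actual SoftUpdate class):

import torch

def polyak_update(target_params, source_params, tau: float) -> None:
    # Move each target parameter a fraction tau toward its source counterpart:
    #   target <- tau * source + (1 - tau) * target
    with torch.no_grad():
        for t, s in zip(target_params, source_params):
            t.mul_(1.0 - tau).add_(s, alpha=tau)

# Tiny usage example with two linear layers standing in for the Q-networks.
q_network = torch.nn.Linear(4, 2)
q_network_target = torch.nn.Linear(4, 2)
polyak_update(q_network_target.parameters(), q_network.parameters(), tau=0.005)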
Example #2
    def configure_optimizers(self):
        optimizers = []

        optimizers.append(
            self.q_network_optimizer.make_optimizer(self.q1_network.parameters())
        )
        if self.q2_network:
            optimizers.append(
                self.q_network_optimizer.make_optimizer(self.q2_network.parameters())
            )
        optimizers.append(
            self.actor_network_optimizer.make_optimizer(self.actor_network.parameters())
        )
        if self.alpha_optimizer is not None:
            optimizers.append(self.alpha_optimizer.make_optimizer([self.log_alpha]))
        if self.value_network:
            optimizers.append(
                self.value_network_optimizer.make_optimizer(
                    self.value_network.parameters()
                )
            )
        # soft-update
        if self.value_network:
            target_params = list(self.value_network_target.parameters())
            source_params = list(self.value_network.parameters())
        else:
            target_params = list(self.q1_network_target.parameters())
            source_params = list(self.q1_network.parameters())
            if self.q2_network:
                target_params += list(self.q2_network_target.parameters())
                source_params += list(self.q2_network.parameters())
        optimizers.append(SoftUpdate(target_params, source_params, tau=self.tau))
        return optimizers
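The `self.log_alpha` line shows the SAC entropy temperature being trained like any other parameter: a single tensor, wrapped in a list, with its own optimizer. A small standalone sketch of that pattern with plain torch.optim; the loss formulation and the numbers are illustrative stand-ins, not taken from the snippet:

import torch

# Learnable log of the entropy temperature; kept in log-space so alpha stays positive.
log_alpha = torch.zeros(1, requires_grad=True)
alpha_optimizer = torch.optim.Adam([log_alpha], lr=3e-4)

# One illustrative update step using the common SAC temperature loss,
# with a stand-in log-probability instead of a real policy sample.
target_entropy = -2.0              # e.g. -action_dim; hypothetical value
log_prob = torch.tensor(-1.5)      # stand-in for the policy's log pi(a|s)
alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
alpha_optimizer.zero_grad()
alpha_loss.backward()
alpha_optimizer.step()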
Example #3
    def configure_optimizers(self):
        optimizers = []
        target_params = list(self.q_network_target.parameters())
        source_params = list(self.q_network.parameters())

        optimizers.append(
            self.q_network_optimizer.make_optimizer_scheduler(
                self.q_network.parameters()))

        if self.calc_cpe_in_training:
            (
                cpe_target_params,
                cpe_source_params,
                cpe_optimizers,
            ) = self._configure_cpe_optimizers()
            target_params += cpe_target_params
            source_params += cpe_source_params
            optimizers += cpe_optimizers

        optimizers.append(
            SoftUpdate.make_optimizer_scheduler(target_params,
                                                source_params,
                                                tau=self.tau))

        return optimizers
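Unlike Example #2, this variant calls `make_optimizer_scheduler`, which pairs each optimizer with a learning-rate scheduler. A rough sketch of such a pair built with plain torch.optim; the dict layout follows the usual PyTorch Lightning convention and is an assumption here, not ReAgent's documented return type:

import torch

q_network = torch.nn.Linear(8, 4)  # stand-in for self.q_network

optimizer = torch.optim.Adam(q_network.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.9)

# PyTorch Lightning accepts optimizer/scheduler pairs in this dict form,
# which is one plausible shape for what such a helper returns.
opt_and_sched = {"optimizer": optimizer, "lr_scheduler": scheduler}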
Example #4
    def configure_optimizers(self):
        optimizers = []

        optimizers.append(
            self.q_network_optimizer.make_optimizer(
                self.q1_network.parameters()))
        if self.q2_network:
            optimizers.append(
                self.q_network_optimizer.make_optimizer(
                    self.q2_network.parameters()))
        optimizers.append(
            self.actor_network_optimizer.make_optimizer(
                self.actor_network.parameters()))

        # soft-update
        target_params = list(self.q1_network_target.parameters())
        source_params = list(self.q1_network.parameters())
        if self.q2_network:
            target_params += list(self.q2_network_target.parameters())
            source_params += list(self.q2_network.parameters())
        target_params += list(self.actor_network_target.parameters())
        source_params += list(self.actor_network.parameters())
        optimizers.append(
            SoftUpdate(target_params, source_params, tau=self.tau))
        return optimizers
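This example folds the actor's target parameters into the same SoftUpdate as the two Q-networks by concatenating the parameter lists. A compact standalone equivalent of that concatenation using itertools.chain (illustrative stand-in networks, not the trainer's own):

import itertools
import torch

# Stand-in networks for q1/q2/actor and their targets.
q1, q2, actor = (torch.nn.Linear(4, 2) for _ in range(3))
q1_t, q2_t, actor_t = (torch.nn.Linear(4, 2) for _ in range(3))

# Chain the per-network parameter iterators into flat lists, keeping target and
# source parameters aligned in the same order.
target_params = list(itertools.chain(q1_t.parameters(), q2_t.parameters(), actor_t.parameters()))
source_params = list(itertools.chain(q1.parameters(), q2.parameters(), actor.parameters()))
assert len(target_params) == len(source_params)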
Example #5
    def configure_optimizers(self):
        optimizers = [
            self.q_network_optimizer.make_optimizer(self.q_network.parameters())
        ]
        # soft-update
        target_params = list(self.q_network_target.parameters())
        source_params = list(self.q_network.parameters())
        optimizers.append(SoftUpdate(target_params, source_params, tau=self.tau))
        return optimizers
Example #6
    def configure_optimizers(self):
        optimizers = []

        optimizers.append(
            self.q_network_optimizer.make_optimizer_scheduler(
                self.q_network.parameters()
            )
        )

        target_params = list(self.q_network_target.parameters())
        source_params = list(self.q_network.parameters())
        optimizers.append(
            SoftUpdate.make_optimizer_scheduler(
                target_params, source_params, tau=self.tau
            )
        )

        return optimizers
Example #7
    def __init__(
        self,
        q_network,
        q_network_target,
        reward_network,
        q_network_cpe=None,
        q_network_cpe_target=None,
        metrics_to_score=None,
        imitator=None,
        loss_reporter=None,
        use_gpu: bool = False,
        actions: List[str] = field(default_factory=list),  # noqa: B008
        rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
        double_q_learning: bool = True,
        bcq: Optional[BCQConfig] = None,
        minibatch_size: int = 1024,
        minibatches_per_step: int = 1,
        optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        evaluation: EvaluationParameters = field(  # noqa: B008
            default_factory=EvaluationParameters),
    ) -> None:
        super().__init__(
            rl,
            use_gpu=use_gpu,
            metrics_to_score=metrics_to_score,
            actions=actions,
            evaluation_parameters=evaluation,
            loss_reporter=loss_reporter,
        )
        assert self._actions is not None, "Discrete-action DQN needs action names"
        self.double_q_learning = double_q_learning
        self.minibatch_size = minibatch_size
        self.minibatches_per_step = minibatches_per_step or 1

        self.q_network = q_network
        self.q_network_target = q_network_target
        self.q_network_optimizer = optimizer.make_optimizer(
            q_network.parameters())

        self.q_network_soft_update = SoftUpdate(
            self.q_network_target.parameters(), self.q_network.parameters(),
            self.tau)

        self._initialize_cpe(reward_network,
                             q_network_cpe,
                             q_network_cpe_target,
                             optimizer=optimizer)

        # pyre-fixme[6]: Expected `Sized` for 1st param but got `Optional[List[str]]`.
        self.reward_boosts = torch.zeros([1, len(self._actions)],
                                         device=self.device)
        if rl.reward_boost is not None:
            # pyre-fixme[16]: `Optional` has no attribute `keys`.
            for k in rl.reward_boost.keys():
                # pyre-fixme[16]: `Optional` has no attribute `index`.
                i = self._actions.index(k)
                # pyre-fixme[16]: `Optional` has no attribute `__getitem__`.
                self.reward_boosts[0, i] = rl.reward_boost[k]

        # Batch constrained q-learning
        self.bcq = bcq is not None
        if self.bcq:
            assert bcq is not None
            self.bcq_drop_threshold = bcq.drop_threshold
            self.bcq_imitator = imitator
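The reward-boost block near the end of this constructor turns a mapping from action names to bonus values into a `[1, num_actions]` tensor indexed by action position. A standalone sketch of that lookup with hypothetical action names and boost values:

import torch

actions = ["no_op", "recommend", "notify"]          # hypothetical action names
reward_boost = {"recommend": 0.5, "notify": -0.1}   # hypothetical boost values

reward_boosts = torch.zeros([1, len(actions)])
for name, boost in reward_boost.items():
    reward_boosts[0, actions.index(name)] = boost

print(reward_boosts)  # tensor([[ 0.0000,  0.5000, -0.1000]])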