Example #1
    def __init__(
        self,
        qnet: Union[NeuralNetworkModule, nn.Module],
        qnet_target: Union[NeuralNetworkModule, nn.Module],
        optimizer: Callable,
        criterion: Callable,
        *_,
        lr_scheduler: Callable = None,
        lr_scheduler_args: Tuple[Tuple] = None,
        lr_scheduler_kwargs: Tuple[Dict] = None,
        batch_size: int = 100,
        epsilon_decay: float = 0.9999,
        update_rate: Union[float, None] = 0.005,
        update_steps: Union[int, None] = None,
        learning_rate: float = 0.001,
        discount: float = 0.99,
        gradient_max: float = np.inf,
        replay_size: int = 500000,
        replay_device: Union[str, t.device] = "cpu",
        replay_buffer: Buffer = None,
        mode: str = "double",
        visualize: bool = False,
        visualize_dir: str = "",
        **__,
    ):
        """
        Note:
            DQN is only available for discrete environments.

        Note:
            Dueling DQN is a network structure rather than a framework, so
            it can be applied to all three modes.

            If ``mode = "vanilla"``, implements the simplest online DQN,
            with a replay buffer.

            If ``mode = "fixed_target"``, implements DQN with a target network
            and a replay buffer. Described in `this <https://web.stanford.\
edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf>`__ paper.

            If ``mode = "double"``, implements Double DQN described in
            `this <https://arxiv.org/pdf/1509.06461.pdf>`__ paper.

        Note:
            Vanilla DQN only needs one network, so internally, ``qnet``
            is assigned to ``qnet_target``.

        Note:
            In order to implement dueling DQN, you should create two dense
            output layers.

            In your q network::

                    self.fc_adv = nn.Linear(in_features=...,
                                            out_features=num_actions)
                    self.fc_val = nn.Linear(in_features=...,
                                            out_features=1)

            Then in your ``forward()`` method, you should implement the output as::

                    adv = self.fc_adv(some_input)
                    val = self.fc_val(some_input).expand(self.batch_size,
                                                         self.num_actions)
                    return val + adv - adv.mean(1, keepdim=True)

        Note:
            Your optimizer will be called as::

                optimizer(network.parameters(), learning_rate)

            Your lr_scheduler will be called as::

                lr_scheduler(
                    optimizer,
                    *lr_scheduler_args[0],
                    **lr_scheduler_kwargs[0],
                )

            Your criterion will be called as::

                criterion(
                    target_value.view(batch_size, 1),
                    predicted_value.view(batch_size, 1)
                )

        Note:
            DQN supports two ways of updating the target network. The first
            way is polyak update (soft update), which updates the target
            network in every training step by mixing its weights with the
            online network using ``update_rate``.

            The other way is hard update, which copies the weights of the
            online network to the target network every ``update_steps``
            training steps.

            You can specify either ``update_rate`` or ``update_steps`` to
            select one update scheme; if both are specified, an error will
            be raised.

            These two update schemes may result in different training
            stability.

        Attributes:
            epsilon: Current epsilon value, which determines the randomness of
                ``act_discrete_with_noise``. You can set it to any value.

        Args:
            qnet: Q network module.
            qnet_target: Target Q network module.
            optimizer: Optimizer used to optimize ``qnet``.
            criterion: Criterion used to evaluate the value loss.
            learning_rate: Learning rate of the optimizer, not compatible with
                ``lr_scheduler``.
            lr_scheduler: Learning rate scheduler of ``optimizer``.
            lr_scheduler_args: Arguments of the learning rate scheduler.
            lr_scheduler_kwargs: Keyword arguments of the learning
                rate scheduler.
            batch_size: Batch size used during training.
            epsilon_decay: Epsilon decay rate per acting-with-noise step.
                The ``epsilon`` attribute is multiplied by this value every
                time ``act_discrete_with_noise`` is called.
            update_rate: :math:`\\tau` used to update target networks.
                Target parameters are updated as:

                :math:`\\theta_t = \\theta * \\tau + \\theta_t * (1 - \\tau)`
            update_steps: Number of training steps between hard updates of
                the target networks.
            discount: :math:`\\gamma` used in the Bellman equation.
            gradient_max: Maximum gradient.
            replay_size: Replay buffer size. Not compatible with
                ``replay_buffer``.
            replay_device: Device where the replay buffer is located. Not
                compatible with ``replay_buffer``.
            replay_buffer: Custom replay buffer.
            mode: One of ``"vanilla", "fixed_target", "double"``.
            visualize: Whether to visualize the network flow in the first pass.
            visualize_dir: Visualized graph save directory.
        """
        self.batch_size = batch_size
        self.epsilon_decay = epsilon_decay
        self.update_rate = update_rate
        self.update_steps = update_steps
        self.discount = discount
        self.grad_max = gradient_max
        self.visualize = visualize
        self.visualize_dir = visualize_dir
        self.mode = mode
        self.epsilon = 1
        self._update_counter = 0

        if mode not in {"vanilla", "fixed_target", "double"}:
            raise ValueError(f"Unknown DQN mode: {mode}")

        if update_rate is not None and update_steps is not None:
            raise ValueError("You can only specify one target network update"
                             " scheme, either by update_rate or update_steps,"
                             " but not both.")

        self.qnet = qnet
        if self.mode == "vanilla":
            self.qnet_target = qnet
        else:
            self.qnet_target = qnet_target
        self.qnet_optim = optimizer(self.qnet.parameters(), lr=learning_rate)
        self.replay_buffer = (Buffer(replay_size, replay_device)
                              if replay_buffer is None else replay_buffer)

        # Make sure target and online networks have the same weight
        with t.no_grad():
            hard_update(self.qnet, self.qnet_target)

        if lr_scheduler is not None:
            if lr_scheduler_args is None:
                lr_scheduler_args = ((), )
            if lr_scheduler_kwargs is None:
                lr_scheduler_kwargs = ({}, )
            self.qnet_lr_sch = lr_scheduler(self.qnet_optim,
                                            *lr_scheduler_args[0],
                                            **lr_scheduler_kwargs[0])

        self.criterion = criterion

        super().__init__()
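
A minimal usage sketch for the constructor above, assuming the enclosing class is the ``DQN`` framework shown here and that a small fully connected Q network fits the environment. ``QNet``, its layer sizes, and the hyperparameters are illustrative assumptions, not part of the example::

    import torch as t
    import torch.nn as nn

    class QNet(nn.Module):
        # Hypothetical Q network for a discrete environment with
        # `state_dim` observations and `action_num` actions.
        def __init__(self, state_dim, action_num):
            super().__init__()
            self.fc1 = nn.Linear(state_dim, 16)
            self.fc2 = nn.Linear(16, action_num)

        def forward(self, state):
            return self.fc2(t.relu(self.fc1(state)))

    # The framework calls optimizer(qnet.parameters(), lr=learning_rate) and
    # criterion(target_value, predicted_value) internally, so plain torch
    # classes can be passed in directly.
    dqn = DQN(
        qnet=QNet(4, 2),
        qnet_target=QNet(4, 2),
        optimizer=t.optim.Adam,
        criterion=nn.MSELoss(reduction="sum"),
        mode="double",
    )
    dqn.epsilon = 0.5  # adjust the randomness of act_discrete_with_noise
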
Example #2
    def __init__(self,
                 qnet: Union[NeuralNetworkModule, nn.Module],
                 qnet_target: Union[NeuralNetworkModule, nn.Module],
                 optimizer: Callable,
                 criterion: Callable,
                 *_,
                 lr_scheduler: Callable = None,
                 lr_scheduler_args: Tuple[Tuple] = None,
                 lr_scheduler_kwargs: Tuple[Dict] = None,
                 batch_size: int = 100,
                 update_rate: float = 0.005,
                 learning_rate: float = 0.001,
                 discount: float = 0.99,
                 gradient_max: float = np.inf,
                 replay_size: int = 500000,
                 replay_device: Union[str, t.device] = "cpu",
                 replay_buffer: Buffer = None,
                 mode: str = "double",
                 visualize: bool = False,
                 visualize_dir: str = "",
                 **__):
        """
        Note:
            DQN is only available for discrete environments.

        Note:
            Dueling DQN is a network structure rather than a framework, so
            it can be applied to all three modes.

            If ``mode = "vanilla"``, implements the simplest online DQN,
            with a replay buffer.

            If ``mode = "fixed_target"``, implements DQN with a target network
            and a replay buffer. Described in `this <https://web.stanford.\
edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf>`__ paper.

            If ``mode = "double"``, implements Double DQN described in
            `this <https://arxiv.org/pdf/1509.06461.pdf>`__ paper.

        Note:
            Vanilla DQN only needs one network, so internally, ``qnet``
            is assigned to ``qnet_target``.

        Note:
            In order to implement dueling DQN, you should create two dense
            output layers.

            In your q network::

                    self.fc_adv = nn.Linear(in_features=...,
                                            out_features=num_actions)
                    self.fc_val = nn.Linear(in_features=...,
                                            out_features=1)

            Then in your ``forward()`` method, you should implement the output as::

                    adv = self.fc_adv(some_input)
                    val = self.fc_val(some_input).expand(self.batch_size,
                                                         self.num_actions)
                    return val + adv - adv.mean(1, keepdim=True)

        Note:
            Your optimizer will be called as::

                optimizer(network.parameters(), learning_rate)

            Your lr_scheduler will be called as::

                lr_scheduler(
                    optimizer,
                    *lr_scheduler_args[0],
                    **lr_scheduler_kwargs[0],
                )

            Your criterion will be called as::

                criterion(
                    target_value.view(batch_size, 1),
                    predicted_value.view(batch_size, 1)
                )

        Args:
            qnet: Q network module.
            qnet_target: Target Q network module.
            optimizer: Optimizer used to optimize ``qnet``.
            criterion: Criterion used to evaluate the value loss.
            learning_rate: Learning rate of the optimizer, not compatible with
                ``lr_scheduler``.
            lr_scheduler: Learning rate scheduler of ``optimizer``.
            lr_scheduler_args: Arguments of the learning rate scheduler.
            lr_scheduler_kwargs: Keyword arguments of the learning
                rate scheduler.
            batch_size: Batch size used during training.
            update_rate: :math:`\\tau` used to update target networks.
                Target parameters are updated as:

                :math:`\\theta_t = \\theta * \\tau + \\theta_t * (1 - \\tau)`

            discount: :math:`\\gamma` used in the Bellman equation.
            gradient_max: Maximum gradient.
            replay_size: Replay buffer size. Not compatible with
                ``replay_buffer``.
            replay_device: Device where the replay buffer is located. Not
                compatible with ``replay_buffer``.
            replay_buffer: Custom replay buffer.
            mode: One of ``"vanilla", "fixed_target", "double"``.
            visualize: Whether to visualize the network flow in the first pass.
            visualize_dir: Visualized graph save directory.
        """
        self.batch_size = batch_size
        self.update_rate = update_rate
        self.discount = discount
        self.grad_max = gradient_max
        self.visualize = visualize
        self.visualize_dir = visualize_dir

        if mode not in {"vanilla", "fixed_target", "double"}:
            raise ValueError("Unknown DQN mode: {}".format(mode))
        self.mode = mode

        self.qnet = qnet
        if self.mode == "vanilla":
            self.qnet_target = qnet
        else:
            self.qnet_target = qnet_target
        self.qnet_optim = optimizer(self.qnet.parameters(), lr=learning_rate)
        self.replay_buffer = (Buffer(replay_size, replay_device)
                              if replay_buffer is None else replay_buffer)

        # Make sure target and online networks have the same weight
        with t.no_grad():
            hard_update(self.qnet, self.qnet_target)

        if lr_scheduler is not None:
            if lr_scheduler_args is None:
                lr_scheduler_args = ((), )
            if lr_scheduler_kwargs is None:
                lr_scheduler_kwargs = ({}, )
            self.qnet_lr_sch = lr_scheduler(self.qnet_optim,
                                            *lr_scheduler_args[0],
                                            **lr_scheduler_kwargs[0])

        self.criterion = criterion

        super(DQN, self).__init__()
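
The dueling DQN note in the docstring above can be turned into a complete module. The sketch below is an assumption-laden variant that takes the batch size from the input via ``expand_as`` rather than storing ``self.batch_size`` on the module; names and sizes are illustrative::

    import torch as t
    import torch.nn as nn

    class DuelingQNet(nn.Module):
        # Hypothetical dueling Q network: a shared body feeding separate
        # advantage and value heads, combined as V + A - mean(A).
        def __init__(self, state_dim, num_actions):
            super().__init__()
            self.fc = nn.Linear(state_dim, 64)
            self.fc_adv = nn.Linear(64, num_actions)
            self.fc_val = nn.Linear(64, 1)

        def forward(self, state):
            hidden = t.relu(self.fc(state))
            adv = self.fc_adv(hidden)                 # [batch, num_actions]
            val = self.fc_val(hidden).expand_as(adv)  # [batch, num_actions]
            return val + adv - adv.mean(1, keepdim=True)
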
Example #3
    def __init__(self,
                 actor: Union[NeuralNetworkModule, nn.Module],
                 actor_target: Union[NeuralNetworkModule, nn.Module],
                 critic: Union[NeuralNetworkModule, nn.Module],
                 critic_target: Union[NeuralNetworkModule, nn.Module],
                 optimizer: Callable,
                 criterion: Callable,
                 *_,
                 lr_scheduler: Callable = None,
                 lr_scheduler_args: Tuple[Tuple, Tuple] = None,
                 lr_scheduler_kwargs: Tuple[Dict, Dict] = None,
                 batch_size: int = 100,
                 update_rate: float = 0.001,
                 update_steps: Union[int, None] = None,
                 actor_learning_rate: float = 0.0005,
                 critic_learning_rate: float = 0.001,
                 discount: float = 0.99,
                 gradient_max: float = np.inf,
                 replay_size: int = 500000,
                 replay_device: Union[str, t.device] = "cpu",
                 replay_buffer: Buffer = None,
                 visualize: bool = False,
                 visualize_dir: str = "",
                 **__):
        """
        Note:
            Your optimizer will be called as::

                optimizer(network.parameters(), learning_rate)

            Your lr_scheduler will be called as::

                lr_scheduler(
                    optimizer,
                    *lr_scheduler_args[0],
                    **lr_scheduler_kwargs[0],
                )

            Your criterion will be called as::

                criterion(
                    target_value.view(batch_size, 1),
                    predicted_value.view(batch_size, 1)
                )

        Note:
            DDPG supports two ways of updating the target network. The first
            way is polyak update (soft update), which updates the target
            network in every training step by mixing its weights with the
            online network using ``update_rate``.

            The other way is hard update, which copies the weights of the
            online network to the target network every ``update_steps``
            training steps.

            You can specify either ``update_rate`` or ``update_steps`` to
            select one update scheme; if both are specified, an error will
            be raised.

            These two update schemes may result in different training
            stability.

        Args:
            actor: Actor network module.
            actor_target: Target actor network module.
            critic: Critic network module.
            critic_target: Target critic network module.
            optimizer: Optimizer used to optimize ``actor`` and ``critic``.
            criterion: Criterion used to evaluate the value loss.
            lr_scheduler: Learning rate scheduler of ``optimizer``.
            lr_scheduler_args: Arguments of the learning rate scheduler.
            lr_scheduler_kwargs: Keyword arguments of the learning
                rate scheduler.
            batch_size: Batch size used during training.
            update_rate: :math:`\\tau` used to update target networks.
                Target parameters are updated as:

                :math:`\\theta_t = \\theta * \\tau + \\theta_t * (1 - \\tau)`
            update_steps: Number of training steps between hard updates of
                the target networks.
            actor_learning_rate: Learning rate of the actor optimizer,
                not compatible with ``lr_scheduler``.
            critic_learning_rate: Learning rate of the critic optimizer,
                not compatible with ``lr_scheduler``.
            discount: :math:`\\gamma` used in the Bellman equation.
            gradient_max: Maximum gradient.
            replay_size: Replay buffer size. Not compatible with
                ``replay_buffer``.
            replay_device: Device where the replay buffer is located. Not
                compatible with ``replay_buffer``.
            replay_buffer: Custom replay buffer.
            visualize: Whether to visualize the network flow in the first pass.
            visualize_dir: Visualized graph save directory.
        """
        self.batch_size = batch_size
        self.update_rate = update_rate
        self.update_steps = update_steps
        self.discount = discount
        self.gradient_max = gradient_max
        self.visualize = visualize
        self.visualize_dir = visualize_dir
        self._update_counter = 0

        if update_rate is not None and update_steps is not None:
            raise ValueError("You can only specify one target network update"
                             " scheme, either by update_rate or update_steps,"
                             " but not both.")

        self.actor = actor
        self.actor_target = actor_target
        self.critic = critic
        self.critic_target = critic_target
        self.actor_optim = optimizer(self.actor.parameters(),
                                     lr=actor_learning_rate)
        self.critic_optim = optimizer(self.critic.parameters(),
                                      lr=critic_learning_rate)
        self.replay_buffer = (Buffer(replay_size, replay_device)
                              if replay_buffer is None else replay_buffer)

        # Make sure target and online networks have the same weight
        with t.no_grad():
            hard_update(self.actor, self.actor_target)
            hard_update(self.critic, self.critic_target)

        if lr_scheduler is not None:
            if lr_scheduler_args is None:
                lr_scheduler_args = ((), ())
            if lr_scheduler_kwargs is None:
                lr_scheduler_kwargs = ({}, {})
            self.actor_lr_sch = lr_scheduler(self.actor_optim,
                                             *lr_scheduler_args[0],
                                             **lr_scheduler_kwargs[0])
            self.critic_lr_sch = lr_scheduler(self.critic_optim,
                                              *lr_scheduler_args[1],
                                              **lr_scheduler_kwargs[1])

        self.criterion = criterion
        super().__init__()
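
The soft (polyak) and hard update schemes selected by ``update_rate`` and ``update_steps`` boil down to the two helpers sketched below. These are illustrative stand-ins, not the library's own ``hard_update`` used in the constructor above; only the polyak rule ``theta_target = theta * tau + theta_target * (1 - tau)`` is taken from the docstring::

    import torch as t

    def polyak_update(online: t.nn.Module, target: t.nn.Module, tau: float):
        # Soft update: mix a fraction `tau` of the online weights into the
        # target weights on every training step.
        with t.no_grad():
            for p, p_t in zip(online.parameters(), target.parameters()):
                p_t.copy_(p * tau + p_t * (1.0 - tau))

    def copy_update(online: t.nn.Module, target: t.nn.Module):
        # Hard update: overwrite the target weights entirely, performed once
        # every `update_steps` training steps.
        with t.no_grad():
            target.load_state_dict(online.state_dict())
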
Example #4
    def __init__(self,
                 actor: Union[NeuralNetworkModule, nn.Module],
                 actor_target: Union[NeuralNetworkModule, nn.Module],
                 critic: Union[NeuralNetworkModule, nn.Module],
                 critic_target: Union[NeuralNetworkModule, nn.Module],
                 optimizer: Callable,
                 criterion: Callable,
                 *_,
                 lr_scheduler: Callable = None,
                 lr_scheduler_args: Tuple[Tuple, Tuple] = None,
                 lr_scheduler_kwargs: Tuple[Dict, Dict] = None,
                 batch_size: int = 100,
                 update_rate: float = 0.001,
                 actor_learning_rate: float = 0.0005,
                 critic_learning_rate: float = 0.001,
                 discount: float = 0.99,
                 gradient_max: float = np.inf,
                 replay_size: int = 500000,
                 replay_device: Union[str, t.device] = "cpu",
                 replay_buffer: Buffer = None,
                 visualize: bool = False,
                 visualize_dir: str = "",
                 **__):
        """
        Note:
            Your optimizer will be called as::

                optimizer(network.parameters(), learning_rate)

            Your lr_scheduler will be called as::

                lr_scheduler(
                    optimizer,
                    *lr_scheduler_args[0],
                    **lr_scheduler_kwargs[0],
                )

            Your criterion will be called as::

                criterion(
                    target_value.view(batch_size, 1),
                    predicted_value.view(batch_size, 1)
                )

        Args:
            actor: Actor network module.
            actor_target: Target actor network module.
            critic: Critic network module.
            critic_target: Target critic network module.
            optimizer: Optimizer used to optimize ``actor`` and ``critic``.
            criterion: Criterion used to evaluate the value loss.
            lr_scheduler: Learning rate scheduler of ``optimizer``.
            lr_scheduler_args: Arguments of the learning rate scheduler.
            lr_scheduler_kwargs: Keyword arguments of the learning
                rate scheduler.
            batch_size: Batch size used during training.
            update_rate: :math:`\\tau` used to update target networks.
                Target parameters are updated as:

                :math:`\\theta_t = \\theta * \\tau + \\theta_t * (1 - \\tau)`
            actor_learning_rate: Learning rate of the actor optimizer,
                not compatible with ``lr_scheduler``.
            critic_learning_rate: Learning rate of the critic optimizer,
                not compatible with ``lr_scheduler``.
            discount: :math:`\\gamma` used in the Bellman equation.
            gradient_max: Maximum gradient.
            replay_size: Replay buffer size. Not compatible with
                ``replay_buffer``.
            replay_device: Device where the replay buffer is located. Not
                compatible with ``replay_buffer``.
            replay_buffer: Custom replay buffer.
            visualize: Whether to visualize the network flow in the first pass.
            visualize_dir: Visualized graph save directory.
        """
        self.batch_size = batch_size
        self.update_rate = update_rate
        self.discount = discount
        self.grad_max = gradient_max
        self.visualize = visualize
        self.visualize_dir = visualize_dir

        self.actor = actor
        self.actor_target = actor_target
        self.critic = critic
        self.critic_target = critic_target
        self.actor_optim = optimizer(self.actor.parameters(),
                                     lr=actor_learning_rate)
        self.critic_optim = optimizer(self.critic.parameters(),
                                      lr=critic_learning_rate)
        self.replay_buffer = (Buffer(replay_size, replay_device)
                              if replay_buffer is None else replay_buffer)

        # Make sure target and online networks have the same weight
        with t.no_grad():
            hard_update(self.actor, self.actor_target)
            hard_update(self.critic, self.critic_target)

        if lr_scheduler is not None:
            if lr_scheduler_args is None:
                lr_scheduler_args = ((), ())
            if lr_scheduler_kwargs is None:
                lr_scheduler_kwargs = ({}, {})
            self.actor_lr_sch = lr_scheduler(self.actor_optim,
                                             *lr_scheduler_args[0],
                                             **lr_scheduler_kwargs[0])
            self.critic_lr_sch = lr_scheduler(self.critic_optim,
                                              *lr_scheduler_args[1],
                                              **lr_scheduler_kwargs[1])

        self.criterion = criterion
        super(DDPG, self).__init__()
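
Because DDPG builds one scheduler per network, ``lr_scheduler_args`` and ``lr_scheduler_kwargs`` are tuples with one entry for the actor and one for the critic. A sketch of how a caller might build them, assuming ``LambdaLR`` as the scheduler and hypothetical decay functions::

    from torch.optim.lr_scheduler import LambdaLR

    def actor_decay(epoch: int) -> float:
        # Hypothetical decay schedule for the actor optimizer.
        return 0.99 ** epoch

    def critic_decay(epoch: int) -> float:
        # Hypothetical decay schedule for the critic optimizer.
        return 0.95 ** epoch

    # Entry 0 configures the actor scheduler and entry 1 the critic scheduler,
    # mirroring lr_scheduler(optimizer, *args[i], **kwargs[i]) in the code above.
    scheduler = LambdaLR           # passed as the `lr_scheduler` argument
    scheduler_args = ((), ())      # passed as `lr_scheduler_args`
    scheduler_kwargs = (           # passed as `lr_scheduler_kwargs`
        {"lr_lambda": actor_decay},
        {"lr_lambda": critic_decay},
    )
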
Example #5
    def __init__(
        self,
        discriminator: Union[NeuralNetworkModule, nn.Module],
        constrained_policy_optimization: Union[PPO, TRPO],
        optimizer: Callable,
        *_,
        lr_scheduler: Callable = None,
        lr_scheduler_args: Tuple[Tuple] = None,
        lr_scheduler_kwargs: Tuple[Dict] = None,
        batch_size: int = 100,
        discriminator_update_times: int = 1,
        discriminator_learning_rate: float = 0.001,
        gradient_max: float = np.inf,
        expert_replay_size: int = 500000,
        expert_replay_device: Union[str, t.device] = "cpu",
        expert_replay_buffer: Buffer = None,
        visualize: bool = False,
        visualize_dir: str = "",
        **__,
    ):
        """
        Note:
            The forward method of your discriminator network must take two arguments::

                def forward(self,
                            state: Dict[str, t.Tensor],
                            action: Dict[str, t.Tensor])

            And return a tag vector (float type) of size ``[batch_size, 1]``;
            usually you can do this with a sigmoid output layer.

            If you set ``concatenate_samples`` to ``False`` during the ``update()``
            call, then you should expect inputs of type ``Dict[str, List[t.Tensor]]``
            instead.

        Note:
            You can access the following attributes:

                1. ``actor``
                2. ``critic``
                3. ``actor_optim``
                4. ``critic_optim``
                5. ``actor_lr_sch`` (may not exist if you are not using lr scheduler)
                6. ``critic_lr_sch`` (may not exist if you are not using lr scheduler)
                7. ``replay_buffer``

            of the input PPO or TRPO framework directly from a GAIL instance,
            since they are forwarded here.

            For other attributes, you need to manually access them from the
            ``constrained_policy_optimization`` attribute.

        Args:
            discriminator: Discriminator network module.
            constrained_policy_optimization: A constrained policy optimization
                framework, currently can be a :class:`.PPO` or :class:`.TRPO`
                framework.
            optimizer: Optimizer used to optimize ``discriminator``.
            discriminator_learning_rate: Learning rate of the discriminator optimizer,
                not compatible with ``lr_scheduler``.
            lr_scheduler: Learning rate scheduler of ``optimizer``.
            lr_scheduler_args: Arguments of the learning rate scheduler.
            lr_scheduler_kwargs: Keyword arguments of the learning
                rate scheduler.
            batch_size: Batch size used during discriminator training.
            discriminator_update_times: Number of times to update the
                discriminator per ``update()`` call.
            gradient_max: Maximum gradient.
            expert_replay_size: Expert trajectory buffer size. Not compatible with
                ``expert_replay_buffer``.
            expert_replay_device: Device where the expert replay buffer is located.
                Not compatible with ``expert_replay_buffer``.
            expert_replay_buffer: Custom expert replay buffer.
            visualize: Whether to visualize the network flow in the first pass.
            visualize_dir: Visualized graph save directory.
        """
        self.batch_size = batch_size
        self.gradient_max = gradient_max
        self.visualize = visualize
        self.visualize_dir = visualize_dir

        self.constrained_policy_optimization = constrained_policy_optimization
        self.actor = constrained_policy_optimization.actor
        self.critic = constrained_policy_optimization.critic
        self.actor_optim = constrained_policy_optimization.actor_optim
        self.critic_optim = constrained_policy_optimization.critic_optim
        if hasattr(constrained_policy_optimization, "actor_lr_sch"):
            self.actor_lr_sch = constrained_policy_optimization.actor_lr_sch
        if hasattr(constrained_policy_optimization, "critic_lr_sch"):
            self.critic_lr_sch = constrained_policy_optimization.critic_lr_sch
        self.replay_buffer = constrained_policy_optimization.replay_buffer

        self.discriminator = discriminator
        # By default it is BCELoss, you can modify this attribute.
        self.discriminator_criterion = nn.BCELoss()
        self.discriminator_update_times = discriminator_update_times
        self.discriminator_optim = optimizer(self.discriminator.parameters(),
                                             lr=discriminator_learning_rate)
        self.expert_replay_buffer = (
            Buffer(expert_replay_size, expert_replay_device)
            if expert_replay_buffer is None else expert_replay_buffer)

        if lr_scheduler is not None:
            if lr_scheduler_args is None:
                lr_scheduler_args = ((), )
            if lr_scheduler_kwargs is None:
                lr_scheduler_kwargs = ({}, )
            self.discriminator_lr_sch = lr_scheduler(
                self.discriminator_optim,
                *lr_scheduler_args[0],
                **lr_scheduler_kwargs[0],
            )

        self.bce_criterion = nn.BCELoss()
        super().__init__()
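
A sketch of a discriminator that satisfies the note in the docstring above: it takes the state and action dicts, concatenates the tensors stored under assumed keys ``"state"`` and ``"action"``, and produces a sigmoid score of shape ``[batch_size, 1]``. The dict keys and layer sizes are assumptions, not part of the example::

    from typing import Dict
    import torch as t
    import torch.nn as nn

    class Discriminator(nn.Module):
        # Hypothetical GAIL discriminator; the dict keys used below are
        # assumptions about how the caller stores observations and actions.
        def __init__(self, state_dim, action_dim):
            super().__init__()
            self.fc1 = nn.Linear(state_dim + action_dim, 64)
            self.fc2 = nn.Linear(64, 1)

        def forward(self, state: Dict[str, t.Tensor], action: Dict[str, t.Tensor]):
            x = t.cat([state["state"], action["action"]], dim=1)
            x = t.relu(self.fc1(x))
            return t.sigmoid(self.fc2(x))  # float tensor of shape [batch_size, 1]
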
Example #6
    def __init__(self,
                 actor: Union[NeuralNetworkModule, nn.Module],
                 critic: Union[NeuralNetworkModule, nn.Module],
                 optimizer: Callable,
                 criterion: Callable,
                 *_,
                 lr_scheduler: Callable = None,
                 lr_scheduler_args: Tuple[Tuple, Tuple] = None,
                 lr_scheduler_kwargs: Tuple[Dict, Dict] = None,
                 batch_size: int = 100,
                 actor_update_times: int = 5,
                 critic_update_times: int = 10,
                 actor_learning_rate: float = 0.001,
                 critic_learning_rate: float = 0.001,
                 entropy_weight: float = None,
                 value_weight: float = 0.5,
                 gradient_max: float = np.inf,
                 gae_lambda: float = 1.0,
                 discount: float = 0.99,
                 normalize_advantage: bool = True,
                 replay_size: int = 500000,
                 replay_device: Union[str, t.device] = "cpu",
                 replay_buffer: Buffer = None,
                 visualize: bool = False,
                 visualize_dir: str = "",
                 **__):
        """
        Important:
            When given a state, and an optional action, actor must
            at least return two values:

            **1. Action**

              For **continuous environments**, action must be of shape
              ``[batch_size, action_dim]`` and *clamped by the action space*.
              For **discrete environments**, action can be of shape
              ``[batch_size, action_dim]`` if it is a one-hot vector, or
              ``[batch_size, 1]`` or ``[batch_size]`` if it is a categorically
              encoded integer.

              When the given action is not None, actor must return the given
              action.

            **2. Log likelihood of action (action probability)**

              For either type of environment, log likelihood is of shape
              ``[batch_size, 1]`` or ``[batch_size]``.

              Action probability must be differentiable; the gradient of the
              actor is calculated from the gradient of the action probability.

              When the given action is not None, actor must return the log
              likelihood of the given action.

            The third returned value, entropy, is optional:

            **3. Entropy of action distribution**

              Entropy is usually calculated using ``dist.entropy()``; its shape
              is ``[batch_size, 1]`` or ``[batch_size]``. You must specify
              ``entropy_weight`` to make it effective.

        Hint:
            For continuous environments, actions are not directly output by
            your actor, since it would otherwise be rather inconvenient to
            calculate the log probability of the action. Instead, your actor
            network should output parameters for a certain distribution
            (e.g. :class:`~torch.distributions.normal.Normal`)
            and then draw the action from it.

            For discrete environments,
            :class:`~torch.distributions.categorical.Categorical` is sufficient,
            since the differentiable ``rsample()`` is not needed.

            This trick is also known as **reparameterization**.

        Hint:
            Actions are sampled during training in the actor-critic
            family (A2C, A3C, PPO, TRPO, IMPALA).

            When your actor model is given a batch of actions and states, it
            must evaluate the states, and return the log likelihood of the
            given actions instead of re-sampling actions.

            An example of your actor in continuous environments::

                class ActorNet(nn.Module):
                    def __init__(self):
                        super(ActorNet, self).__init__()
                        self.fc = nn.Linear(3, 100)
                        self.mu_head = nn.Linear(100, 1)
                        self.sigma_head = nn.Linear(100, 1)

                    def forward(self, state, action=None):
                        x = t.relu(self.fc(state))
                        mu = 2.0 * t.tanh(self.mu_head(x))
                        sigma = F.softplus(self.sigma_head(x))
                        dist = Normal(mu, sigma)
                        action = (action
                                  if action is not None
                                  else dist.sample())
                        action_entropy = dist.entropy()
                        action = action.clamp(-2.0, 2.0)

                        # Since we are representing a multivariate gaussian
                        # distribution in terms of independent univariate gaussians:
                        action_log_prob = dist.log_prob(action).sum(
                            dim=1, keepdim=True
                        )
                        return action, action_log_prob, action_entropy

        Hint:
            Entropy weight is usually negative, to increase exploration.

            Value weight is usually 0.5, so that the critic network converges
            more slowly than the actor network and learns from more states.

            The update equation is equivalent to:

            :math:`Loss = w_e * Entropy + w_v * Loss_v + w_a * Loss_a`

            :math:`Loss_a = -log\\_likelihood * advantage`

            :math:`Loss_v = criterion(target\\_bellman\\_value - V(s))`

        Args:
            actor: Actor network module.
            critic: Critic network module.
            optimizer: Optimizer used to optimize ``actor`` and ``critic``.
            criterion: Criterion used to evaluate the value loss.
            lr_scheduler: Learning rate scheduler of ``optimizer``.
            lr_scheduler_args: Arguments of the learning rate scheduler.
            lr_scheduler_kwargs: Keyword arguments of the learning
                rate scheduler.
            batch_size: Batch size used during training.
            actor_update_times: Times to update actor in ``update()``.
            critic_update_times: Times to update critic in ``update()``.
            actor_learning_rate: Learning rate of the actor optimizer,
                not compatible with ``lr_scheduler``.
            critic_learning_rate: Learning rate of the critic optimizer,
                not compatible with ``lr_scheduler``.
            entropy_weight: Weight of entropy in your loss function; a positive
                entropy weight will minimize entropy, while a negative one will
                maximize entropy.
            value_weight: Weight of critic value loss.
            gradient_max: Maximum gradient.
            gae_lambda: :math:`\\lambda` used in generalized advantage
                estimation.
            discount: :math:`\\gamma` used in the Bellman equation.
            normalize_advantage: Whether to normalize sampled advantage values in
                the batch.
            replay_size: Replay buffer size. Not compatible with
                ``replay_buffer``.
            replay_device: Device where the replay buffer is located. Not
                compatible with ``replay_buffer``.
            replay_buffer: Custom replay buffer.
            visualize: Whether to visualize the network flow in the first pass.
            visualize_dir: Visualized graph save directory.
        """
        self.batch_size = batch_size
        self.actor_update_times = actor_update_times
        self.critic_update_times = critic_update_times
        self.discount = discount
        self.value_weight = value_weight
        self.entropy_weight = entropy_weight
        self.gradient_max = gradient_max
        self.gae_lambda = gae_lambda
        self.normalize_advantage = normalize_advantage
        self.visualize = visualize
        self.visualize_dir = visualize_dir

        self.actor = actor
        self.critic = critic
        self.actor_optim = optimizer(self.actor.parameters(),
                                     lr=actor_learning_rate)
        self.critic_optim = optimizer(self.critic.parameters(),
                                      lr=critic_learning_rate)
        self.replay_buffer = (Buffer(replay_size, replay_device)
                              if replay_buffer is None else replay_buffer)

        if lr_scheduler is not None:
            if lr_scheduler_args is None:
                lr_scheduler_args = ((), ())
            if lr_scheduler_kwargs is None:
                lr_scheduler_kwargs = ({}, {})
            self.actor_lr_sch = lr_scheduler(
                self.actor_optim,
                *lr_scheduler_args[0],
                **lr_scheduler_kwargs[0],
            )
            self.critic_lr_sch = lr_scheduler(self.critic_optim,
                                              *lr_scheduler_args[1],
                                              **lr_scheduler_kwargs[1])

        self.criterion = criterion

        super().__init__()
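
The docstring above shows a continuous-action actor; for a discrete environment the same contract (return the action, its log likelihood, and optionally its entropy, and evaluate a given action instead of re-sampling) can be met with ``Categorical``. A sketch with assumed state and action dimensions::

    import torch as t
    import torch.nn as nn
    from torch.distributions import Categorical

    class DiscreteActorNet(nn.Module):
        # Hypothetical discrete actor: outputs logits over `action_num`
        # actions and returns (action, log_prob, entropy).
        def __init__(self, state_dim, action_num):
            super().__init__()
            self.fc1 = nn.Linear(state_dim, 64)
            self.fc2 = nn.Linear(64, action_num)

        def forward(self, state, action=None):
            logits = self.fc2(t.relu(self.fc1(state)))
            dist = Categorical(logits=logits)
            # When an action batch is given, evaluate it; otherwise sample.
            act = action.flatten() if action is not None else dist.sample()
            return act, dist.log_prob(act), dist.entropy()
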
Example #7
    def __init__(
        self,
        actor: Union[NeuralNetworkModule, nn.Module],
        critic: Union[NeuralNetworkModule, nn.Module],
        critic_target: Union[NeuralNetworkModule, nn.Module],
        critic2: Union[NeuralNetworkModule, nn.Module],
        critic2_target: Union[NeuralNetworkModule, nn.Module],
        optimizer: Callable,
        criterion: Callable,
        *_,
        lr_scheduler: Callable = None,
        lr_scheduler_args: Tuple[Tuple, Tuple, Tuple] = None,
        lr_scheduler_kwargs: Tuple[Dict, Dict, Dict] = None,
        target_entropy: float = None,
        initial_entropy_alpha: float = 1.0,
        batch_size: int = 100,
        update_rate: float = 0.005,
        update_steps: Union[int, None] = None,
        actor_learning_rate: float = 0.0005,
        critic_learning_rate: float = 0.001,
        alpha_learning_rate: float = 0.001,
        discount: float = 0.99,
        gradient_max: float = np.inf,
        replay_size: int = 500000,
        replay_device: Union[str, t.device] = "cpu",
        replay_buffer: Buffer = None,
        visualize: bool = False,
        visualize_dir: str = "",
        **__
    ):
        """
        See Also:
            :class:`.A2C`
            :class:`.DDPG`

        Important:
            When given a state, and an optional action, actor must
            at least return two values, similar to the actor structure
            described in :class:`.A2C`. However, when actor is asked to
            select an action based on the current state, you must make
            sure that the sampling process is **differentiable**. E.g.
            use the ``rsample`` method of torch distributions instead
            of the ``sample`` method.

            Compared to other actor-critic methods, SAC embeds the
            entropy term into its reward function directly, rather than adding
            the entropy term to the actor's loss function. Therefore, we do not
            use the entropy output of your actor network.

            The SAC algorithm uses Q networks as critics, so please refer to
            :class:`.DDPG` for the requirements and the definition of
            ``action_trans_func``.

        Args:
            actor: Actor network module.
            critic: Critic network module.
            critic_target: Target critic network module.
            critic2: The second critic network module.
            critic2_target: The second target critic network module.
            optimizer: Optimizer used to optimize ``actor``, ``critic`` and
                ``critic2``.
            criterion: Criterion used to evaluate the value loss.
            *_:
            lr_scheduler: Learning rate scheduler of ``optimizer``.
            lr_scheduler_args: Arguments of the learning rate scheduler.
            lr_scheduler_kwargs: Keyword arguments of the learning
                rate scheduler.
            target_entropy: Target entropy used to automatically adjust the
                entropy weight :math:`\\alpha` in the SAC soft value function:
                :math:`V_{soft}(s_t) = \\mathbb{E}_{a_t\\sim\\pi}[\
                                        Q_{soft}(s_t,a_t) - \
                                        \\alpha \\log\\pi(a_t|s_t)]`
            initial_entropy_alpha: Initial entropy weight :math:`\\alpha`.
            gradient_max: Maximum gradient.
            batch_size: Batch size used during training.
            update_rate: :math:`\\tau` used to update target networks.
                Target parameters are updated as:

                :math:`\\theta_t = \\theta * \\tau + \\theta_t * (1 - \\tau)`
            update_steps: Number of training steps between hard updates of
                the target networks.
            actor_learning_rate: Learning rate of the actor optimizer,
                not compatible with ``lr_scheduler``.
            critic_learning_rate: Learning rate of the critic optimizer,
                not compatible with ``lr_scheduler``.
            alpha_learning_rate: Learning rate of the optimizer for the
                entropy weight :math:`\\alpha`, not compatible with
                ``lr_scheduler``.
            discount: :math:`\\gamma` used in the Bellman equation.
            replay_size: Replay buffer size. Not compatible with
                ``replay_buffer``.
            replay_device: Device where the replay buffer is located. Not
                compatible with ``replay_buffer``.
            replay_buffer: Custom replay buffer.
            visualize: Whether to visualize the network flow in the first pass.
            visualize_dir: Visualized graph save directory.
        """
        self.batch_size = batch_size
        self.update_rate = update_rate
        self.update_steps = update_steps
        self.discount = discount
        self.visualize = visualize
        self.visualize_dir = visualize_dir
        self.entropy_alpha = t.tensor([initial_entropy_alpha], requires_grad=True)
        self.grad_max = gradient_max
        self.target_entropy = target_entropy
        self._update_counter = 0

        if update_rate is not None and update_steps is not None:
            raise ValueError(
                "You can only specify one target network update"
                " scheme, either by update_rate or update_steps,"
                " but not both."
            )

        self.actor = actor
        self.critic = critic
        self.critic_target = critic_target
        self.critic2 = critic2
        self.critic2_target = critic2_target
        self.actor_optim = optimizer(self.actor.parameters(), lr=actor_learning_rate)
        self.critic_optim = optimizer(self.critic.parameters(), lr=critic_learning_rate)
        self.critic2_optim = optimizer(
            self.critic2.parameters(), lr=critic_learning_rate
        )
        self.alpha_optim = optimizer([self.entropy_alpha], lr=alpha_learning_rate)
        self.replay_buffer = (
            Buffer(replay_size, replay_device)
            if replay_buffer is None
            else replay_buffer
        )

        # Make sure target and online networks have the same weight
        with t.no_grad():
            hard_update(self.critic, self.critic_target)
            hard_update(self.critic2, self.critic2_target)

        if lr_scheduler is not None:
            if lr_scheduler_args is None:
                lr_scheduler_args = ((), (), ())
            if lr_scheduler_kwargs is None:
                lr_scheduler_kwargs = ({}, {}, {})
            self.actor_lr_sch = lr_scheduler(
                self.actor_optim, *lr_scheduler_args[0], **lr_scheduler_kwargs[0],
            )
            self.critic_lr_sch = lr_scheduler(
                self.critic_optim, *lr_scheduler_args[1], **lr_scheduler_kwargs[1]
            )
            self.critic2_lr_sch = lr_scheduler(
                self.critic2_optim, *lr_scheduler_args[1], **lr_scheduler_kwargs[1]
            )
            self.alpha_lr_sch = lr_scheduler(
                self.alpha_optim, *lr_scheduler_args[2], **lr_scheduler_kwargs[2]
            )

        self.criterion = criterion
        super().__init__()
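
For SAC the actor must sample differentiably, as the note above requires. Below is a minimal sketch of such a Gaussian actor using ``rsample``; tanh squashing and its log-probability correction are deliberately omitted, and all names and sizes are illustrative assumptions::

    import torch as t
    import torch.nn as nn
    import torch.nn.functional as F
    from torch.distributions import Normal

    class SACActorNet(nn.Module):
        # Hypothetical Gaussian actor; rsample() keeps the sampling step
        # differentiable, unlike sample().
        def __init__(self, state_dim, action_dim):
            super().__init__()
            self.fc = nn.Linear(state_dim, 64)
            self.mu_head = nn.Linear(64, action_dim)
            self.sigma_head = nn.Linear(64, action_dim)

        def forward(self, state, action=None):
            x = t.relu(self.fc(state))
            dist = Normal(self.mu_head(x), F.softplus(self.sigma_head(x)))
            # Evaluate the given action if provided, otherwise draw a
            # reparameterized sample.
            act = action if action is not None else dist.rsample()
            log_prob = dist.log_prob(act).sum(dim=1, keepdim=True)
            return act, log_prob
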