Example #1
    def _update_learning_rate(
        self,
        optimizers: Union[List[th.optim.Optimizer], th.optim.Optimizer],
    ) -> None:
        # Update the main optimizers via the parent class, then log and apply
        # the auxiliary learning rate from its own schedule
        super(PPG, self)._update_learning_rate(optimizers)
        logger.record("train/aux_learning_rate",
                      self.aux_lr_schedule(self._current_progress_remaining))
        update_learning_rate(
            self.policy.aux_optimizer,
            self.aux_lr_schedule(self._current_progress_remaining))
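
These snippets all rely on an update_learning_rate helper and a Stable-Baselines3-style logger. As a point of reference, a minimal sketch of such a helper, which simply overwrites the learning rate of every parameter group of a PyTorch optimizer, could look like the following (an illustration of the assumed behaviour, not necessarily the library's exact code):

import torch as th

def update_learning_rate(optimizer: th.optim.Optimizer, learning_rate: float) -> None:
    # Overwrite the learning rate of every parameter group of the optimizer
    for param_group in optimizer.param_groups:
        param_group["lr"] = learning_rate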
Example #2
    def _update_learning_rate(self, optimizers: Union[List[th.optim.Optimizer], th.optim.Optimizer]) -> None:
        """
        Update the optimizers' learning rate using the current learning rate schedule
        and the current progress (from 1 to 0).

        :param optimizers: (Union[List[th.optim.Optimizer], th.optim.Optimizer])
            An optimizer or a list of optimizers.
        """
        # Log the current learning rate
        self.logger.logkv("learning_rate", self.lr_schedule(self._current_progress))

        if not isinstance(optimizers, list):
            optimizers = [optimizers]
        for optimizer in optimizers:
            update_learning_rate(optimizer, self.lr_schedule(self._current_progress))
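
The lr_schedule attribute is assumed to be a callable that maps the training progress (1 at the start of training, 0 at the end) to a learning rate. A hypothetical linear schedule illustrating that convention:

from typing import Callable

def linear_schedule(initial_lr: float) -> Callable[[float], float]:
    # progress goes from 1 (beginning of training) down to 0 (end of training)
    def schedule(progress: float) -> float:
        return progress * initial_lr
    return schedule

lr_schedule = linear_schedule(3e-4)
print(lr_schedule(1.0))  # 3e-4 at the start of training
print(lr_schedule(0.5))  # 1.5e-4 halfway through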
Example #3
    def _update_critic_learning_rate(
        self, optimizers: Union[List[th.optim.Optimizer],
                                th.optim.Optimizer]) -> None:
        """
        Update the optimizers' learning rate using the current learning rate schedule
        and the current progress remaining (from 1 to 0).

        :param optimizers:
            An optimizer or a list of optimizers.
        """
        # Log the current learning rate
        logger.record(
            "train/learning_rate_critic",
            self.lr_schedule_critic(self._current_progress_remaining))

        if not isinstance(optimizers, list):
            optimizers = [optimizers]
        for optimizer in optimizers:
            update_learning_rate(
                optimizer,
                self.lr_schedule_critic(self._current_progress_remaining))
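
The isinstance(optimizers, list) check lets the same method accept either a single optimizer or several. A standalone toy illustration of that normalization pattern, using made-up optimizers:

import torch as th

def set_lr(optimizers, lr: float) -> None:
    # Accept a single optimizer or a list of optimizers, as in the methods above
    if not isinstance(optimizers, list):
        optimizers = [optimizers]
    for optimizer in optimizers:
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr

# Toy optimizers purely for demonstration
opt_a = th.optim.Adam([th.nn.Parameter(th.zeros(1))], lr=1e-3)
opt_b = th.optim.Adam([th.nn.Parameter(th.zeros(1))], lr=1e-3)

set_lr(opt_a, 5e-4)           # single optimizer
set_lr([opt_a, opt_b], 1e-4)  # list of optimizers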
Example #4
    def train_mer(self, gradient_steps: int, batch_size: int = 64) -> None:
        optimizers = [self.policy.optimizer]

        # Reset optimizers:
        if self.reset_optimizers_during_training:
            for i_optimizer, optimizer in enumerate(optimizers):
                optimizer.__init__(optimizer.param_groups[0]['params'])
                optimizers[i_optimizer] = optimizer

        # Compute the base learning rate from the schedule
        # (it is applied to the optimizers inside the loop below)
        base_lr = self.lr_schedule(self._current_progress_remaining)

        # Save the initial weights of the policy for the Reptile step
        models_to_update = [self.policy]
        initial_state_dicts = [
            deepcopy(model.state_dict()) for model in models_to_update
        ]

        losses = []
        current_example_ind = randint(0, gradient_steps - 1)
        for gradient_step in range(gradient_steps):
            # Sample replay buffer or current example, and update learning rate accordingly
            if gradient_step == current_example_ind:
                replay_data = self.current_experience_buffer.sample(
                    1, env=self._vec_normalize_env)
                for optimizer in optimizers:
                    update_learning_rate(optimizer, base_lr * self.mer_s)
            else:
                replay_data = self.replay_buffer.sample(
                    batch_size, env=self._vec_normalize_env)
                for optimizer in optimizers:
                    update_learning_rate(optimizer, base_lr)

            with th.no_grad():
                # Compute the target Q values
                target_q = self.q_net_target(replay_data.next_observations)
                # Follow greedy policy: use the one with the highest value
                target_q, _ = target_q.max(dim=1)
                # Avoid potential broadcast issue
                target_q = target_q.reshape(-1, 1)
                # 1-step TD target
                target_q = replay_data.rewards + (
                    1 - replay_data.dones) * self.gamma * target_q

            # Get current Q estimates
            current_q = self.q_net(replay_data.observations)

            # Retrieve the q-values for the actions from the replay buffer
            current_q = th.gather(current_q,
                                  dim=1,
                                  index=replay_data.actions.long())

            # Compute the MSE loss between the current and target Q-values
            loss = F.mse_loss(current_q, target_q)
            losses.append(loss.item())

            # Optimize the policy
            self.policy.optimizer.zero_grad()
            loss.backward()
            # Clip gradient norm
            th.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                        self.max_grad_norm)
            self.policy.optimizer.step()

        # Perform Reptile step
        for i_model, model in enumerate(models_to_update):
            self.reptile_step_state_dict(model, initial_state_dicts[i_model])

        # Increase update counter
        self._n_updates += gradient_steps

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude="tensorboard")
        logger.record("train/loss", np.mean(losses))
Example #5
    def train_mer(self, gradient_steps: int, batch_size: int = 64) -> None:
        optimizers = [self.actor.optimizer, self.critic.optimizer]
        if self.ent_coef_optimizer is not None:
            optimizers += [self.ent_coef_optimizer]

        optimizers_to_reset = optimizers

        # Reset optimizers:
        for i_optimizer, optimizer in enumerate(optimizers_to_reset):
            optimizer.__init__(optimizer.param_groups[0]['params'])
            optimizers[i_optimizer] = optimizer

        # Compute the base learning rate from the schedule
        # (it is applied to the optimizers inside the loop below)
        base_lr = self.lr_schedule(self._current_progress_remaining)

        ent_coef_losses, ent_coefs = [], []
        actor_losses, critic_losses = [], []

        # Save the initial weights of the actor and critic (but not the critic_target)
        # for the Reptile step
        models_to_update = [self.actor, self.critic]
        initial_state_dicts = [
            deepcopy(model.state_dict()) for model in models_to_update
        ]
        initial_log_ent_coef = self.log_ent_coef.detach().clone()

        current_example_ind = randint(0, gradient_steps - 1)
        for gradient_step in range(gradient_steps):
            # Sample replay buffer or current example, and update learning rate accordingly
            if gradient_step == current_example_ind:
                replay_data = self.current_experience_buffer.sample(
                    1, env=self._vec_normalize_env)
                for optimizer in optimizers:
                    update_learning_rate(optimizer, base_lr * self.mer_s)
            else:
                replay_data = self.replay_buffer.sample(
                    batch_size, env=self._vec_normalize_env)
                for optimizer in optimizers:
                    update_learning_rate(optimizer, base_lr)

            # We need to sample because `log_std` may have changed between two gradient steps
            if self.use_sde:
                self.actor.reset_noise()

            # Action by the current actor for the sampled state
            actions_pi, log_prob = self.actor.action_log_prob(
                replay_data.observations)
            log_prob = log_prob.reshape(-1, 1)

            ent_coef_loss = None
            if self.ent_coef_optimizer is not None:
                # Important: detach the variable from the graph
                # so we don't change it with other losses
                # see https://github.com/rail-berkeley/softlearning/issues/60
                ent_coef = th.exp(self.log_ent_coef.detach())
                ent_coef_loss = -(
                    self.log_ent_coef *
                    (log_prob + self.target_entropy).detach()).mean()
                ent_coef_losses.append(ent_coef_loss.item())
            else:
                ent_coef = self.ent_coef_tensor

            ent_coefs.append(ent_coef.item())

            # Optimize entropy coefficient, also called
            # entropy temperature or alpha in the paper
            if ent_coef_loss is not None:
                self.ent_coef_optimizer.zero_grad()
                ent_coef_loss.backward()
                self.ent_coef_optimizer.step()

            with th.no_grad():
                # Select action according to policy
                next_actions, next_log_prob = self.actor.action_log_prob(
                    replay_data.next_observations)
                # Compute the target Q value: min over all critics targets
                targets = th.cat(self.critic_target(
                    replay_data.next_observations, next_actions),
                                 dim=1)
                target_q, _ = th.min(targets, dim=1, keepdim=True)
                # add entropy term
                target_q = target_q - ent_coef * next_log_prob.reshape(-1, 1)
                # td error + entropy term
                q_backup = replay_data.rewards + (
                    1 - replay_data.dones) * self.gamma * target_q

            # Get current Q estimates for each critic network
            # using action from the replay buffer
            current_q_estimates = self.critic(replay_data.observations,
                                              replay_data.actions)

            # Compute critic loss
            critic_loss = 0.5 * sum([
                F.mse_loss(current_q, q_backup)
                for current_q in current_q_estimates
            ])
            critic_losses.append(critic_loss.item())

            # Optimize the critic
            self.critic.optimizer.zero_grad()
            critic_loss.backward()
            self.critic.optimizer.step()

            # Compute actor loss
            # Alternative: actor_loss = th.mean(log_prob - qf1_pi)
            # Min over all critic networks
            q_values_pi = th.cat(self.critic.forward(replay_data.observations,
                                                     actions_pi),
                                 dim=1)
            min_qf_pi, _ = th.min(q_values_pi, dim=1, keepdim=True)
            actor_loss = (ent_coef * log_prob - min_qf_pi).mean()
            actor_losses.append(actor_loss.item())

            # Optimize the actor
            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

        # Perform Reptile step
        for i_model, model in enumerate(models_to_update):
            self.reptile_step_state_dict(model, initial_state_dicts[i_model])
        self.log_ent_coef.data = self.reptile_step_tensor(
            self.log_ent_coef.data, initial_log_ent_coef.data)

        # Update target networks
        polyak_update(self.critic.parameters(),
                      self.critic_target.parameters(), self.tau)

        self._n_updates += gradient_steps

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude="tensorboard")
        logger.record("train/ent_coef", np.mean(ent_coefs))
        logger.record("train/actor_loss", np.mean(actor_losses))
        logger.record("train/critic_loss", np.mean(critic_losses))
        if len(ent_coef_losses) > 0:
            logger.record("train/ent_coef_loss", np.mean(ent_coef_losses))