Example 1
def test_polyak():
    param1, param2 = th.nn.Parameter(th.ones((5, 5))), th.nn.Parameter(th.zeros((5, 5)))
    target1, target2 = th.nn.Parameter(th.ones((5, 5))), th.nn.Parameter(th.zeros((5, 5)))
    tau = 0.1
    polyak_update([param1], [param2], tau)
    with th.no_grad():
        for param, target_param in zip([target1], [target2]):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    assert th.allclose(param1, target1)
    assert th.allclose(param2, target2)
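
Note: every example below relies on the same helper. As the test above verifies, polyak_update(params, target_params, tau) performs an in-place, gradient-free soft update: target <- tau * param + (1 - tau) * target. Below is a minimal sketch of such a helper, assuming the Stable-Baselines3-style signature used throughout these examples.

from typing import Iterable

import torch as th


def polyak_update_sketch(params: Iterable[th.Tensor],
                         target_params: Iterable[th.Tensor],
                         tau: float) -> None:
    """Soft update: target <- tau * param + (1 - tau) * target, applied in place."""
    with th.no_grad():
        for param, target_param in zip(params, target_params):
            target_param.data.mul_(1.0 - tau)
            target_param.data.add_(param.data, alpha=tau)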
Example 2
    def _on_step(self) -> None:
        """
        Update the exploration rate and target network if needed.
        This method is called in ``collect_rollouts()`` after each step in the environment.
        """
        if self.num_timesteps % self.target_update_interval == 0:
            polyak_update(self.q_net.parameters(),
                          self.q_net_target.parameters(), self.tau)

        self.exploration_rate = self.exploration_schedule(
            self._current_progress_remaining)
        logger.record("rollout/exploration rate", self.exploration_rate)
Example 3
    def train(self, gradient_steps: int, batch_size: int = 100) -> None:

        # Update learning rate according to lr schedule
        self._update_learning_rate([self.actor.optimizer, self.critic.optimizer])

        actor_losses, critic_losses = [], []

        for gradient_step in range(gradient_steps):

            # Sample replay buffer
            replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)

            with th.no_grad():
                # Select action according to policy and add clipped noise
                noise = replay_data.actions.clone().data.normal_(0, self.target_policy_noise)
                noise = noise.clamp(-self.target_noise_clip, self.target_noise_clip)
                next_actions = (self.actor_target(replay_data.next_observations) + noise).clamp(-1, 1)

                # Compute the target Q value: min over all critics targets
                targets = th.cat(self.critic_target(replay_data.next_observations, next_actions), dim=1)
                target_q, _ = th.min(targets, dim=1, keepdim=True)
                target_q = replay_data.rewards + (1 - replay_data.dones) * self.gamma * target_q

            # Get current Q estimates for each critic network
            current_q_estimates = self.critic(replay_data.observations, replay_data.actions)

            # Compute critic loss
            critic_loss = sum([F.mse_loss(current_q, target_q) for current_q in current_q_estimates])
            critic_losses.append(critic_loss.item())

            # Optimize the critics
            self.critic.optimizer.zero_grad()
            critic_loss.backward()
            self.critic.optimizer.step()

            # Delayed policy updates
            if gradient_step % self.policy_delay == 0:
                # Compute actor loss
                actor_loss = -self.critic.q1_forward(replay_data.observations, self.actor(replay_data.observations)).mean()
                actor_losses.append(actor_loss.item())

                # Optimize the actor
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                self.actor.optimizer.step()

                polyak_update(self.critic.parameters(), self.critic_target.parameters(), self.tau)
                polyak_update(self.actor.parameters(), self.actor_target.parameters(), self.tau)

        self._n_updates += gradient_steps
        logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
        logger.record("train/actor_loss", np.mean(actor_losses))
        logger.record("train/critic_loss", np.mean(critic_losses))
Example 4
    def learn_step(self, idxs, transition_batch, weights):
        Otm1, old_action, env_rew, done, Ot = transition_batch
        batch_size = len(Ot)
        observations = torch.tensor(Otm1, device=self.device)
        actions = torch.tensor(old_action, device=self.device)
        rewards = torch.tensor(env_rew, device=self.device)
        done = torch.tensor(done, device=self.device).float()
        next_observations = torch.tensor(Ot, device=self.device)
        # weights = torch.tensor(weights, device=self.device)
        # Keep gradients through the policy forward pass: actions_pred and
        # log_probs_pred are reused below for the actor loss.
        actions_pred, log_probs_pred = self.policy(observations)
        ent_coef = torch.exp(self.log_ent_coef.detach())
        ent_coef_loss = -(self.log_ent_coef * (log_probs_pred + self.target_entropy).detach()).mean()
        self.ent_coef_optimizer.zero_grad()
        ent_coef_loss.backward()
        self.ent_coef_optimizer.step()

        with torch.no_grad():
            next_actions, next_log_prob = self.policy(next_observations)
            targets = self.policy.critic_target(next_observations, next_actions)
            targets = torch.stack(targets, dim=1)
            target_q, _ = torch.min(targets, dim=1)
            # add entropy term
            target_q = target_q - ent_coef * next_log_prob
            # td error + entropy term
            q_backup = rewards + (1 - done) * self.gamma * target_q

        current_q_estimates = self.policy.critic(observations, actions)
        # Mean of the per-critic MSE losses
        critic_loss = sum(F.mse_loss(current_q, q_backup) for current_q in current_q_estimates) / len(current_q_estimates)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        q_values_pi = torch.stack(self.policy.critic.forward(observations, actions_pred), dim=1)
        min_qf_pi, _ = torch.min(q_values_pi, dim=1, keepdim=True)
        actor_loss = (ent_coef * log_probs_pred - min_qf_pi).mean()

        self.policy_optimizer.zero_grad()
        actor_loss.backward()
        self.policy_optimizer.step()

        polyak_update(self.policy.critic.parameters(), self.policy.critic_target.parameters(), self.tau)

        logger = self.logger
        logger.record_mean("ent_coef_loss",ent_coef_loss.item())
        logger.record_mean("critic_loss",critic_loss.item())
        logger.record_mean("actor_loss",actor_loss.item())
        logger.record_mean("q_backup",q_backup.mean().item())
        logger.record("policy_lr",self.policy_optimizer.get_last_lr())
Example 5
    def _on_update(self) -> None:
        if self._n_updates % self.target_update_interval == 0:
            polyak_update(self.q_net.parameters(), self.q_net_target.parameters(), self.tau)
            if not self.share:
                polyak_update(self.v_mlp_extractor.parameters(), self.v_mlp_extractor_target.parameters(), self.tau)
        
        
        if self.KL:
            if self._n_updates % 2 == 0:
            # if self.vloss_tracker.full and self.vloss_tracker.mean() < 5:
                self.train_mode = 'policy'
                # Careful: train() will call _on_update(),
                # so the loss tracker must be cleared before training.
                self.vloss_tracker.clear()
                self.train(gradient_steps=self.gradient_steps, batch_size=self.batch_size)
                # self.train(gradient_steps=self.replay_buffer.size()*2//self.batch_size, batch_size=self.batch_size)
                self.train_mode = 'value'
                # print("policy updated")

        if self._n_updates % self.behav_update_interval == 0:
            
            polyak_update(self.action_net.parameters(), self.behav_net.parameters(), tau=self.behav_tau)
            if not self.share:
                polyak_update(self.a_mlp_extractor.parameters(), self.a_mlp_extractor_target.parameters(), tau=self.behav_tau)
            self.trajectories = [Trajectory(self.device) for _ in range(self.n_envs)]
            self.trajectory_buffer.reset()
            self.exploration_rate = self.exploration_schedule(self._current_progress_remaining)
Example 6
    def train(self, gradient_steps: int, batch_size: int = 64) -> None:
        # Switch to train mode (this affects batch norm / dropout)
        self.policy.set_training_mode(True)
        # Update optimizers learning rate
        optimizers = [self.actor.optimizer, self.critic.optimizer]
        if self.ent_coef_optimizer is not None:
            optimizers += [self.ent_coef_optimizer]

        # Update learning rate according to lr schedule
        self._update_learning_rate(optimizers)

        ent_coef_losses, ent_coefs = [], []
        actor_losses, critic_losses = [], []

        for gradient_step in range(gradient_steps):
            # Sample replay buffer
            replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)

            # We need to sample because `log_std` may have changed between two gradient steps
            if self.use_sde:
                self.actor.reset_noise()

            # Action by the current actor for the sampled state
            actions_pi, log_prob = self.actor.action_log_prob(replay_data.observations)
            log_prob = log_prob.reshape(-1, 1)

            ent_coef_loss = None
            if self.ent_coef_optimizer is not None:
                # Important: detach the variable from the graph
                # so we don't change it with other losses
                # see https://github.com/rail-berkeley/softlearning/issues/60
                ent_coef = th.exp(self.log_ent_coef.detach())
                ent_coef_loss = -(self.log_ent_coef * (log_prob + self.target_entropy).detach()).mean()
                ent_coef_losses.append(ent_coef_loss.item())
            else:
                ent_coef = self.ent_coef_tensor

            ent_coefs.append(ent_coef.item())

            # Optimize entropy coefficient, also called
            # entropy temperature or alpha in the paper
            if ent_coef_loss is not None:
                self.ent_coef_optimizer.zero_grad()
                ent_coef_loss.backward()
                self.ent_coef_optimizer.step()

            with th.no_grad():
                # Select action according to policy
                next_actions, next_log_prob = self.actor.action_log_prob(replay_data.next_observations)
                # Compute the next Q values: min over all critics targets
                next_q_values = th.cat(self.critic_target(replay_data.next_observations, next_actions), dim=1)
                next_q_values, _ = th.min(next_q_values, dim=1, keepdim=True)
                # add entropy term
                next_q_values = next_q_values - ent_coef * next_log_prob.reshape(-1, 1)
                # td error + entropy term
                target_q_values = replay_data.rewards + (1 - replay_data.dones) * self.gamma * next_q_values

            # Get current Q-values estimates for each critic network
            # using action from the replay buffer
            current_q_values = self.critic(replay_data.observations, replay_data.actions)

            # Compute critic loss
            critic_loss = 0.5 * sum([F.mse_loss(current_q, target_q_values) for current_q in current_q_values])
            critic_losses.append(critic_loss.item())

            # Optimize the critic
            self.critic.optimizer.zero_grad()
            critic_loss.backward()
            self.critic.optimizer.step()

            # Compute actor loss
            # Alternative: actor_loss = th.mean(log_prob - qf1_pi)
            # Mean over all critic networks
            q_values_pi = th.cat(self.critic.forward(replay_data.observations, actions_pi), dim=1)
            min_qf_pi, _ = th.min(q_values_pi, dim=1, keepdim=True)
            actor_loss = (ent_coef * log_prob - min_qf_pi).mean()
            actor_losses.append(actor_loss.item())

            # Optimize the actor
            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

            # Update target networks
            if gradient_step % self.target_update_interval == 0:
                polyak_update(self.critic.parameters(), self.critic_target.parameters(), self.tau)

        self._n_updates += gradient_steps

        self.logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
        self.logger.record("train/ent_coef", np.mean(ent_coefs))
        self.logger.record("train/actor_loss", np.mean(actor_losses))
        self.logger.record("train/critic_loss", np.mean(critic_losses))
        if len(ent_coef_losses) > 0:
            self.logger.record("train/ent_coef_loss", np.mean(ent_coef_losses))
Example 7
    def train(self, gradient_steps: int, batch_size: int) -> None:
        # Set the mppisac coefficient (either constant or from a schedule)
        if isinstance(self.mppisac_coef, float):
            mppisac_coef = self.mppisac_coef
            assert 0.0 <= mppisac_coef <= 1.0, \
                "mppisac_coef should be between 0.0 and 1.0"
        else:
            mppisac_coef = self.mppisac_coef_schedule.value(
                step=self.num_timesteps - self.learning_starts,
                total_steps=self._total_timesteps)
        # Update optimizers learning rate
        optimizers = [self.actor.optimizer, self.critic.optimizer]
        if self.ent_coef_optimizer is not None:
            optimizers += [self.ent_coef_optimizer]

        # Update learning rate according to lr schedule
        self._update_learning_rate(optimizers)

        ent_coef_losses, ent_coefs = [], []
        actor_losses, critic_losses, mppisac_losses = [], [], []
        mb_train_losses, mb_valid_losses = [], []

        for gradient_step in range(gradient_steps):
            # Sample replay buffer
            replay_data = self.replay_buffer.sample(
                batch_size, env=self._vec_normalize_env)

            # train model based MPPICTRL dynamic model
            if mppisac_coef > 0.0:
                mb_valid_loss = self.mbctrl.validate(
                    states=replay_data.observations,
                    next_states=replay_data.next_observations,
                    actions=replay_data.actions,
                )
                mb_valid_losses.append(mb_valid_loss)

                mb_train_loss = self.mbctrl.train(
                    states=replay_data.observations,
                    next_states=replay_data.next_observations,
                    actions=replay_data.actions,
                )
                mb_train_losses.append(np.mean(mb_train_loss))
                # MPPICTRL's suggested actions for observations
                # t1 = time.time()
                if self.mppisac_nprocesses == 1:
                    actions_mb = self.mbctrl.act(replay_data.observations)
                else:
                    with Pool(self.mppisac_nprocesses) as pool:
                        actions_mb_mp = pool.map(self.mbctrl.act,
                                                 replay_data.observations)
                    actions_mb = torch.stack(actions_mb_mp)
                # t2 = time.time()
                # to_log = f"Processing {replay_data.observations.shape[0]} observations in {t2-t1:.2f} seconds [{(t2-t1)/replay_data.observations.shape[0]:.3f}sec/action] using {self.mppisac_nprocesses} cpus"
                # logger.log(to_log, level=20)

                actions_mb = self.policy.scale_action(actions_mb)
                # Deterministic actions from current SAC actor to be used for behavioral cloning loss
                actions_pi_deterministic = self.actor(
                    obs=replay_data.observations, deterministic=True)

            # We need to sample because `log_std` may have changed between two gradient steps
            if self.use_sde:
                self.actor.reset_noise()

            # Action by the current actor for the sampled state
            actions_pi, log_prob = self.actor.action_log_prob(
                replay_data.observations)
            log_prob = log_prob.reshape(-1, 1)

            ent_coef_loss = None
            if self.ent_coef_optimizer is not None:
                # Important: detach the variable from the graph
                # so we don't change it with other losses
                # see https://github.com/rail-berkeley/softlearning/issues/60
                ent_coef = torch.exp(self.log_ent_coef.detach())
                ent_coef_loss = -(
                    self.log_ent_coef *
                    (log_prob + self.target_entropy).detach()).mean()
                ent_coef_losses.append(ent_coef_loss.item())
            else:
                ent_coef = self.ent_coef_tensor

            ent_coefs.append(ent_coef.item())

            # Optimize entropy coefficient, also called
            # entropy temperature or alpha in the paper
            if ent_coef_loss is not None:
                self.ent_coef_optimizer.zero_grad()
                ent_coef_loss.backward()
                self.ent_coef_optimizer.step()

            with torch.no_grad():
                # Select action according to policy
                next_actions, next_log_prob = self.actor.action_log_prob(
                    replay_data.next_observations)
                # Compute the target Q value: min over all critics targets
                targets = torch.cat(self.critic_target(
                    replay_data.next_observations, next_actions),
                                    dim=1)
                target_q, _ = torch.min(targets, dim=1, keepdim=True)
                # add entropy term
                target_q = target_q - ent_coef * next_log_prob.reshape(-1, 1)
                # td error + entropy term
                q_backup = replay_data.rewards + (
                    1 - replay_data.dones) * self.gamma * target_q

                # q(s, MPPI_action) FOR USING DIFF Qs
                if mppisac_coef > 0.0 and self.mppisac_use_qdiff:
                    actions_mb_qs = torch.cat(self.critic_target(
                        replay_data.observations, actions_mb),
                                              dim=1)
                    actions_mb_q, _ = torch.min(actions_mb_qs,
                                                dim=1,
                                                keepdim=True)

            # Get current Q estimates for each critic network
            # using action from the replay buffer
            current_q_estimates = self.critic(replay_data.observations,
                                              replay_data.actions)

            # Compute critic loss
            critic_loss = 0.5 * sum([
                F.mse_loss(current_q, q_backup)
                for current_q in current_q_estimates
            ])
            critic_losses.append(critic_loss.item())

            # Optimize the critic
            self.critic.optimizer.zero_grad()
            critic_loss.backward()
            self.critic.optimizer.step()

            # Compute actor loss
            # Alternative: actor_loss = th.mean(log_prob - qf1_pi)
            # Mean over all critic networks
            q_values_pi = torch.cat(self.critic.forward(
                replay_data.observations, actions_pi),
                                    dim=1)
            min_qf_pi, _ = torch.min(q_values_pi, dim=1, keepdim=True)
            sac_actor_loss = (ent_coef * log_prob - min_qf_pi).mean()

            # --- Behavioral Cloning (MPPISAC) loss ---
            if mppisac_coef > 0.0 and self.mppisac_use_qdiff:
                mppisac_loss = F.mse_loss(min_qf_pi,
                                          actions_mb_q,
                                          reduction="none").mean()  # using qs
            elif mppisac_coef > 0.0:
                mppisac_loss = F.mse_loss(
                    actions_pi_deterministic, actions_mb,
                    reduction="none").mean()  # using actions
            else:
                mppisac_loss = torch.tensor(0.0)
            mppisac_losses.append(mppisac_loss.item())

            actor_loss = (1.0 - mppisac_coef
                          ) * sac_actor_loss + mppisac_coef * mppisac_loss
            actor_losses.append(actor_loss.item())

            # Optimize the actor
            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

            # Update target networks
            if gradient_step % self.target_update_interval == 0:
                polyak_update(self.critic.parameters(),
                              self.critic_target.parameters(), self.tau)

        self._n_updates += gradient_steps

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude="tensorboard")
        logger.record("train/ent_coef", np.mean(ent_coefs))
        logger.record("train/actor_loss", np.mean(actor_losses))
        logger.record("train/critic_loss", np.mean(critic_losses))
        logger.record("train/mppi-sac_loss", np.mean(mppisac_losses))
        logger.record("train/mppi-sac_coef", mppisac_coef)
        logger.record("mbctrl/train_loss", np.mean(mb_train_losses))
        logger.record("mbctrl/valid_loss", np.mean(mb_valid_losses))
        if len(ent_coef_losses) > 0:
            logger.record("train/ent_coef_loss", np.mean(ent_coef_losses))

        self._dump_logs()
Example 8
    def train(self, gradient_steps: int, batch_size: int = 64) -> None:
        # Update optimizers learning rate
        optimizers = [self.actor.optimizer, self.critic.optimizer]

        # Update learning rate according to lr schedule
        self._update_learning_rate(optimizers)

        actor_losses, critic_losses = [], []

        for gradient_step in range(gradient_steps):
            # Sample replay buffer
            replay_data = self.replay_buffer.sample(
                batch_size, env=self._vec_normalize_env)
            batch_size = replay_data.observations.size(0)

            # Critic update
            with th.no_grad():
                target_next_actions = self.actor_target.forward(
                    replay_data.next_observations)
                target_next_actions_q, _ = self.critic_target.forward(
                    replay_data.next_observations, target_next_actions,
                    self.action_dist_samples)
                target_next_actions_q = target_next_actions_q.transpose(1, 2)
                target_expected_Q = replay_data.rewards.unsqueeze(-1) + \
                    (1 - replay_data.dones.unsqueeze(-1)) * self.gamma * target_next_actions_q

            expected_Q, taus = self.critic.forward(replay_data.observations,
                                                   replay_data.actions,
                                                   self.action_dist_samples)

            # Quantile Huber loss
            td_error = target_expected_Q - expected_Q
            huber_1 = calculate_huber_loss(td_error, 1.0)
            quantil_1 = abs(taus -
                            (td_error.detach() < 0).float()) * huber_1 / 1.0

            critic_loss = (quantil_1.sum(dim=1).mean(dim=1, keepdim=True) *
                           replay_data.weights).mean()
            critic_losses.append(critic_loss.item())

            # Optimize critic
            self.critic.optimizer.zero_grad()
            critic_loss.backward()
            clip_grad_norm_(self.critic.parameters(), 1)
            self.critic.optimizer.step()

            # Actor update
            actions = self.actor.forward(replay_data.observations)
            actions_q = self.critic.get_qvalues(replay_data.observations,
                                                actions)
            actor_loss = -actions_q.mean()

            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

            actor_losses.append(actor_loss.item())

            self.replay_buffer.update_priorities(
                replay_data.indices,
                np.clip(
                    abs(
                        td_error.sum(dim=1).mean(
                            dim=1, keepdim=True).data.cpu().numpy()), -1, 1))

            if gradient_step % self.target_update_interval == 0:
                polyak_update(self.actor.parameters(),
                              self.actor_target.parameters(), self.tau)
                polyak_update(self.critic.parameters(),
                              self.critic_target.parameters(), self.tau)

        self._n_updates += gradient_steps

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude="tensorboard")
        logger.record("train/actor_loss", np.mean(actor_losses))
        logger.record("train/critic_loss", np.mean(critic_losses))
Example 9
    def train(self, gradient_steps: int, batch_size: int = 100) -> None:

        # Update learning rate according to lr schedule
        self._update_learning_rate(
            [self.actor.optimizer, self.critic.optimizer])

        actor_losses, critic_losses = [], []
        for i in range(gradient_steps):

            use_bc_loss = False
            self._n_updates += 1
            updatas = 0
            # Sample replay buffer
            #if self.use_expert_demonstration =True and updatas< (gradient_steps/2):
            #self.use_expert_demonstration=0
            if self._n_updates % 2 == 0 and self.use_expert_demonstration == 1:

                replay_data = self.expert_buffer.sample(
                    batch_size, env=self._vec_normalize_env)
                use_bc_loss = True
                #print('buffer_sample')
                #print(replay_data.actions)
            else:
                replay_data = self.replay_buffer.sample(
                    batch_size, env=self._vec_normalize_env)
                #print('replay_sample')
                #print(replay_data.actions)

            #replay_expert_data=self.expert_buffer.sample(batch_size, env=self._vec_normalize_env)
            #replay_data=self.replay_buffer.sample(batch_size,env=self._vec_normalize_env)

            #print(test_data.next_observations[0])
            #replay_data = self.expert_buffer.sample(batch_size, env=self._vec_normalize_env)
            #print(type(replay_data.next_observations))
            # replay_data = self.expert_buffer.sample(batch_size, env=self._vec_normalize_env)
            #use_bc_loss=True

            with th.no_grad():

                # Select action according to policy and add clipped noise
                noise = replay_data.actions.clone().data.normal_(
                    0, self.target_policy_noise)
                noise = noise.clamp(-self.target_noise_clip,
                                    self.target_noise_clip)
                next_actions = (
                    self.actor_target(replay_data.next_observations) +
                    noise).clamp(-1, 1)

                # Compute the next Q-values: min over all critics targets
                next_q_values = th.cat(self.critic_target(
                    replay_data.next_observations, next_actions),
                                       dim=1)
                next_q_values, _ = th.min(next_q_values, dim=1, keepdim=True)
                target_q_values = replay_data.rewards + (
                    1 - replay_data.dones) * self.gamma * next_q_values

            # Get current Q-values estimates for each critic network
            current_q_values = self.critic(replay_data.observations,
                                           replay_data.actions)

            # Compute critic loss
            critic_loss = sum([
                F.mse_loss(current_q, target_q_values)
                for current_q in current_q_values
            ])
            critic_losses.append(critic_loss.item())

            # Optimize the critics
            self.critic.optimizer.zero_grad()
            critic_loss.backward()
            self.critic.optimizer.step()

            # Delayed policy updates
            if self._n_updates % self.policy_delay == 0:
                if use_bc_loss:
                    # Compute actor loss with an additional behavior-cloning
                    # term on the expert actions
                    current_actions = self.actor(replay_data.observations)
                    bc_loss = F.mse_loss(current_actions, replay_data.actions)
                    actor_loss = -self.critic.q1_forward(
                        replay_data.observations,
                        self.actor(replay_data.observations)).mean() + bc_loss
                    actor_losses.append(actor_loss.item())
                else:
                    actor_loss = -self.critic.q1_forward(
                        replay_data.observations,
                        self.actor(replay_data.observations)).mean()
                    actor_losses.append(actor_loss.item())

                # Optimize the actor
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                self.actor.optimizer.step()

                polyak_update(self.critic.parameters(),
                              self.critic_target.parameters(), self.tau)
                polyak_update(self.actor.parameters(),
                              self.actor_target.parameters(), self.tau)

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude="tensorboard")
        if len(actor_losses) > 0:
            logger.record("train/actor_loss", np.mean(actor_losses))
        logger.record("train/critic_loss", np.mean(critic_losses))
Example 10
def run(params: argparse.Namespace):
    Task.init()
    sb3_utils.set_random_seed(params.seed, using_cuda=use_cuda)
    writer = helper.get_summary_writer(__file__, params)
    env = helper.make_env(params, 'env')

    q = network.get_model_class(params)(env).to(device)
    q_hat = network.get_model_class(params)(env).to(device)
    q_hat.load_state_dict(q.state_dict())

    replay_buffer = ReplayBuffer(params.replay_size)
    # todo check optimizer
    opt = optim.Adam(q.parameters(), lr=params.learning_rate)

    all_rewards = []
    state = env.reset()
    episode_reward = [0]
    episode_no = 0
    for t in range(1, params.max_ts + 1):
        # Compute the exploration rate for the current timestep
        # from the epsilon decay schedule
        epsilon = get_epsilon(params.epsilon_start, params.epsilon_end,
                              params.epsilon_decay, t)
        if random.random() < epsilon:
            a = random.randrange(env.action_space.n)
        else:
            val = q(np.expand_dims(state, axis=0))
            a = torch.argmax(val).item()
            # equivalent to q(...).max(1)[1].data[0]
            # (selects max tensor with .max(1) and its index with ...[1])
        s_tp1, r, done, infos = env.step(a)
        episode_reward = list(map(add, episode_reward, [r]))
        replay_buffer.add(state, a, r, s_tp1, done)
        state = s_tp1
        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = [0]
            episode_no += 1

        # replay buffer reached minimum capacity
        if len(replay_buffer) > params.start_train_ts:
            obses_t, actions, rewards, obses_tp1, dones = \
                replay_buffer.sample(params.batch_size)
            rewards = torch.tensor(rewards, dtype=torch.float32) \
                .unsqueeze(1).to(device)
            actions = torch.tensor(actions).unsqueeze(1).to(device)
            dones = torch.tensor(dones).unsqueeze(1).to(device)
            if True:
                with torch.no_grad():
                    # Compute the target Q values
                    target_q = q_hat(obses_tp1)
                    # Follow greedy policy: use the one with the highest value
                    target_q, _ = target_q.max(dim=1)
                    # Avoid potential broadcast issue
                    target_q = target_q.reshape(-1, 1)
                    # 1-step TD target
                    target_q = rewards + ~dones * params.gamma * target_q

                # Get current Q estimates
                current_q = q(obses_t)

                # Retrieve the q-values for the actions from the replay buffer
                current_q = torch.gather(current_q,
                                         dim=1,
                                         index=actions.long())

                # Compute Huber loss (less sensitive to outliers)
                loss = F.smooth_l1_loss(current_q, target_q)

            else:
                val_tp1 = q(obses_tp1)
                val_t = q(obses_t)
                val_hat_tp1 = q_hat(obses_tp1)
                # .T to iterate over columns of the array: https://stackoverflow.com/a/10148855/256002

                r = torch.from_numpy(rewards).to(device)
                #if params.summed_q:
                #    head = heads[idx]
                #else:
                #    head = heads[mirrored_envs.use_for_decisions_idx]
                a = torch.argmax(val_tp1, dim=1)
                td_errors = r + ~dones * params.gamma * val_hat_tp1.gather(
                    1, a.unsqueeze(1)).squeeze()
                q_vals = val_t.gather(1, actions).squeeze()
                #loss = (td_errors.detach() - q_vals).pow(2).mean()
                loss = F.smooth_l1_loss(q_vals, td_errors.detach())

            if done:
                writer.add_scalar("loss_idx", loss.data, episode_no)
                writer.add_scalar("total_loss", loss.data, episode_no)

            # Optimize the policy
            opt.zero_grad()
            loss.backward()
            # Clip gradient norm
            torch.nn.utils.clip_grad_norm_(q.parameters(),
                                           params.max_grad_norm)
            opt.step()
            if t % params.target_network_update_f == 0:
                print('weights copied')
                sb3_utils.polyak_update(q.parameters(), q_hat.parameters(),
                                        1.0)

        if done:
            for idx, ep_reward in enumerate(all_rewards[-1]):
                helper.add_scalar(writer, "episode_reward_idx{}".format(idx),
                                  ep_reward, episode_no, params)
            helper.add_scalar(writer, "steps_count", infos['steps_count'],
                              episode_no, params)

            if episode_no % params.log_interval == 0:
                #print('replaybuffer size:', len(replay_buffer))
                out_str = "Timestep {}".format(t)
                if len(all_rewards) > 0:
                    out_str += ",Reward:{}".format(all_rewards[-1])
                out_str += ", done: {}".format(done)
                out_str += ', steps_count {}'.format(infos['steps_count'])
                out_str += ', epsilon {}'.format(epsilon)
                print(out_str)
    helper.close_summary_writer(writer)
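
In this example the target-network sync uses tau=1.0, so the polyak_update call amounts to a hard copy, equivalent to the load_state_dict done at initialization. A small self-contained sketch of the equivalence, assuming the Stable-Baselines3 helper these examples import:

import torch as th
from stable_baselines3.common.utils import polyak_update

q = th.nn.Linear(4, 2)
q_hat = th.nn.Linear(4, 2)

# Both forms leave q_hat with exactly q's weights.
q_hat.load_state_dict(q.state_dict())
polyak_update(q.parameters(), q_hat.parameters(), tau=1.0)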
Example 11
    def train(self, gradient_steps: int, batch_size: int = 64) -> None:
        # Update optimizers learning rate
        optimizers = [self.actor.optimizer, self.critic.optimizer]
        if self.ent_coef_optimizer is not None:
            optimizers += [self.ent_coef_optimizer]

        # Update learning rate according to lr schedule
        self._update_learning_rate(optimizers)

        ent_coef_losses, ent_coefs = [], []
        actor_losses, critic_losses = [], []

        for gradient_step in range(gradient_steps):
            # Sample replay buffer
            replay_data = self.replay_buffer.sample(
                batch_size, env=self._vec_normalize_env)

            # We need to sample because `log_std` may have changed between two gradient steps
            if self.use_sde:
                self.actor.reset_noise()

            # Action by the current actor for the sampled state
            actions_pi, log_prob = self.actor.action_log_prob(
                replay_data.observations)
            log_prob = log_prob.reshape(-1, 1)

            ent_coef_loss = None
            if self.ent_coef_optimizer is not None:
                # Important: detach the variable from the graph
                # so we don't change it with other losses
                # see https://github.com/rail-berkeley/softlearning/issues/60
                ent_coef = th.exp(self.log_ent_coef.detach())
                ent_coef_loss = -(
                    self.log_ent_coef *
                    (log_prob + self.target_entropy).detach()).mean()
                ent_coef_losses.append(ent_coef_loss.item())
            else:
                ent_coef = self.ent_coef_tensor

            ent_coefs.append(ent_coef.item())
            self.replay_buffer.ent_coef = ent_coef.item()

            # Optimize entropy coefficient, also called
            # entropy temperature or alpha in the paper
            if ent_coef_loss is not None:
                self.ent_coef_optimizer.zero_grad()
                ent_coef_loss.backward()
                self.ent_coef_optimizer.step()

            with th.no_grad():
                # Select action according to policy
                next_actions, next_log_prob = self.actor.action_log_prob(
                    replay_data.next_observations)
                # Compute and cut quantiles at the next state
                # batch x nets x quantiles
                next_quantiles = self.critic_target(
                    replay_data.next_observations, next_actions)

                # Sort and drop top k quantiles to control overestimation.
                n_target_quantiles = self.critic.quantiles_total - self.top_quantiles_to_drop_per_net * self.critic.n_critics
                next_quantiles, _ = th.sort(
                    next_quantiles.reshape(batch_size, -1))
                next_quantiles = next_quantiles[:, :n_target_quantiles]

                # td error + entropy term
                target_quantiles = next_quantiles - ent_coef * next_log_prob.reshape(
                    -1, 1)
                target_quantiles = replay_data.rewards + (
                    1 - replay_data.dones) * self.gamma * target_quantiles
                # Make target_quantiles broadcastable to (batch_size, n_critics, n_target_quantiles).
                target_quantiles.unsqueeze_(dim=1)

            # Get current Quantile estimates using action from the replay buffer
            current_quantiles = self.critic(replay_data.observations,
                                            replay_data.actions)
            # Compute critic loss, not summing over the quantile dimension as in the paper.
            critic_loss = quantile_huber_loss(current_quantiles,
                                              target_quantiles,
                                              sum_over_quantiles=False)
            critic_losses.append(critic_loss.item())

            # Optimize the critic
            self.critic.optimizer.zero_grad()
            critic_loss.backward()
            self.critic.optimizer.step()

            # Compute actor loss
            qf_pi = self.critic(replay_data.observations,
                                actions_pi).mean(dim=2).mean(dim=1,
                                                             keepdim=True)
            actor_loss = (ent_coef * log_prob - qf_pi).mean()
            actor_losses.append(actor_loss.item())

            # Optimize the actor
            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

            # Update target networks
            if gradient_step % self.target_update_interval == 0:
                polyak_update(self.critic.parameters(),
                              self.critic_target.parameters(), self.tau)

        self._n_updates += gradient_steps

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude="tensorboard")
        logger.record("train/ent_coef", np.mean(ent_coefs))
        logger.record("train/actor_loss", np.mean(actor_losses))
        logger.record("train/critic_loss", np.mean(critic_losses))
        if len(ent_coef_losses) > 0:
            logger.record("train/ent_coef_loss", np.mean(ent_coef_losses))
Example 12
    def train_mer(self, gradient_steps: int, batch_size: int = 64) -> None:
        optimizers = [self.actor.optimizer, self.critic.optimizer]
        if self.ent_coef_optimizer is not None:
            optimizers += [self.ent_coef_optimizer]

        optimizers_to_reset = optimizers

        # Reset optimizers:
        for i_optimizer, optimizer in enumerate(optimizers_to_reset):
            optimizer.__init__(optimizer.param_groups[0]['params'])
            optimizers[i_optimizer] = optimizer

        # Update optimizers learning rate
        base_lr = self.lr_schedule(self._current_progress_remaining)

        ent_coef_losses, ent_coefs = [], []
        actor_losses, critic_losses = [], []

        # Save initial weights of model - for actor and critic but not for critic_target
        models_to_update = [self.actor, self.critic]
        initial_state_dicts = [
            deepcopy(model.state_dict()) for model in models_to_update
        ]
        initial_log_ent_coef = self.log_ent_coef.detach().clone()

        current_example_ind = randint(0, gradient_steps - 1)
        for gradient_step in range(gradient_steps):
            # Sample replay buffer or current example, and update learning rate accordingly
            if gradient_step == current_example_ind:
                replay_data = self.current_experience_buffer.sample(
                    1, env=self._vec_normalize_env)
                for optimizer in optimizers:
                    update_learning_rate(optimizer, base_lr * self.mer_s)
            else:
                replay_data = self.replay_buffer.sample(
                    batch_size, env=self._vec_normalize_env)
                for optimizer in optimizers:
                    update_learning_rate(optimizer, base_lr)

            # We need to sample because `log_std` may have changed between two gradient steps
            if self.use_sde:
                self.actor.reset_noise()

            # Action by the current actor for the sampled state
            actions_pi, log_prob = self.actor.action_log_prob(
                replay_data.observations)
            log_prob = log_prob.reshape(-1, 1)

            ent_coef_loss = None
            if self.ent_coef_optimizer is not None:
                # Important: detach the variable from the graph
                # so we don't change it with other losses
                # see https://github.com/rail-berkeley/softlearning/issues/60
                ent_coef = th.exp(self.log_ent_coef.detach())
                ent_coef_loss = -(
                    self.log_ent_coef *
                    (log_prob + self.target_entropy).detach()).mean()
                ent_coef_losses.append(ent_coef_loss.item())
            else:
                ent_coef = self.ent_coef_tensor

            ent_coefs.append(ent_coef.item())

            # Optimize entropy coefficient, also called
            # entropy temperature or alpha in the paper
            if ent_coef_loss is not None:
                self.ent_coef_optimizer.zero_grad()
                ent_coef_loss.backward()
                self.ent_coef_optimizer.step()

            with th.no_grad():
                # Select action according to policy
                next_actions, next_log_prob = self.actor.action_log_prob(
                    replay_data.next_observations)
                # Compute the target Q value: min over all critics targets
                targets = th.cat(self.critic_target(
                    replay_data.next_observations, next_actions),
                                 dim=1)
                target_q, _ = th.min(targets, dim=1, keepdim=True)
                # add entropy term
                target_q = target_q - ent_coef * next_log_prob.reshape(-1, 1)
                # td error + entropy term
                q_backup = replay_data.rewards + (
                    1 - replay_data.dones) * self.gamma * target_q

            # Get current Q estimates for each critic network
            # using action from the replay buffer
            current_q_estimates = self.critic(replay_data.observations,
                                              replay_data.actions)

            # Compute critic loss
            critic_loss = 0.5 * sum([
                F.mse_loss(current_q, q_backup)
                for current_q in current_q_estimates
            ])
            critic_losses.append(critic_loss.item())

            # Optimize the critic
            self.critic.optimizer.zero_grad()
            critic_loss.backward()
            self.critic.optimizer.step()

            # Compute actor loss
            # Alternative: actor_loss = th.mean(log_prob - qf1_pi)
            # Mean over all critic networks
            q_values_pi = th.cat(self.critic.forward(replay_data.observations,
                                                     actions_pi),
                                 dim=1)
            min_qf_pi, _ = th.min(q_values_pi, dim=1, keepdim=True)
            actor_loss = (ent_coef * log_prob - min_qf_pi).mean()
            actor_losses.append(actor_loss.item())

            # Optimize the actor
            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

        # Perform Reptile step
        for i_model, model in enumerate(models_to_update):
            self.reptile_step_state_dict(model, initial_state_dicts[i_model])
        self.log_ent_coef.data = self.reptile_step_tensor(
            self.log_ent_coef.data, initial_log_ent_coef.data)

        # Update target networks
        polyak_update(self.critic.parameters(),
                      self.critic_target.parameters(), self.tau)

        self._n_updates += gradient_steps

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude="tensorboard")
        logger.record("train/ent_coef", np.mean(ent_coefs))
        logger.record("train/actor_loss", np.mean(actor_losses))
        logger.record("train/critic_loss", np.mean(critic_losses))
        if len(ent_coef_losses) > 0:
            logger.record("train/ent_coef_loss", np.mean(ent_coef_losses))
Example 13
    def train(self, gradient_steps: int, batch_size: int = 64) -> None:
        # Update optimizers learning rate
        optimizers = [self.actor.optimizer, self.critic.optimizer]

        # Update learning rate according to lr schedule
        self._update_learning_rate(optimizers)

        mean_loss_q, mean_loss_p, mean_loss_l, max_kl_μ, max_kl_Σ, max_kl = [], [], [], [], [], []
        actor_losses, critic_losses = [], []

        for gradient_step in range(gradient_steps):
            # Sample replay buffer
            replay_data = self.replay_buffer.sample(
                batch_size, env=self._vec_normalize_env)
            batch_size = replay_data.observations.size(0)

            with th.no_grad():
                # Sample "action_samples" num additional actions
                target_next_action_mean, target_next_action_cholesky, _ = self.actor_target.get_action_dist_params(
                    replay_data.next_observations)
                target_next_action_dist = MultivariateNormal(
                    target_next_action_mean,
                    scale_tril=target_next_action_cholesky)
                target_sampled_next_actions = target_next_action_dist.sample(
                    (self.action_samples, )).transpose(0, 1)

                # Compute mean of q values for the samples
                # Expand next_observation to match self.action_samples
                expanded_next_observations = replay_data.next_observations[:, None, :].expand(
                    -1, self.action_samples, -1)
                target_sampled_next_actions_expected_q = get_min_critic_tensor(
                    self.critic_target.forward(
                        expanded_next_observations.reshape(
                            -1, self.features_dim),
                        target_sampled_next_actions.reshape(
                            -1, self.action_dim))).reshape(
                                batch_size, self.action_samples).mean(dim=1)

                # Compute total expected return
                target_sampled_expected_return = replay_data.rewards.squeeze() + (1 - replay_data.dones.squeeze()) * self.gamma * \
                    target_sampled_next_actions_expected_q

            # Optimize the critic
            critic_qs = self.critic.forward(replay_data.observations,
                                            replay_data.actions)
            critic_loss = 0.5 * sum([
                self.critic_loss(current_q.squeeze(),
                                 target_sampled_expected_return)
                for current_q in critic_qs
            ])
            critic_losses.append(critic_loss.item())

            self.critic.optimizer.zero_grad()
            critic_loss.backward()
            self.critic.optimizer.step()

            # Sample additional actions for E-Step
            with th.no_grad():
                target_action_mean, target_action_cholesky, _ = self.actor_target.get_action_dist_params(
                    replay_data.observations)
                target_action_dist = MultivariateNormal(
                    target_action_mean, scale_tril=target_action_cholesky)
                sampled_actions = target_action_dist.sample(
                    (self.action_samples, ))

                # Compute q values for the samples
                # Expand next_observation to match self.action_samples
                expanded_observations = replay_data.observations[
                    None, ...].expand(self.action_samples, -1, -1)
                target_sampled_actions_expected_q = get_min_critic_tensor(
                    self.critic_target.forward(
                        expanded_observations.reshape(-1, self.features_dim),
                        sampled_actions.reshape(-1, self.action_dim))).reshape(
                            self.action_samples, batch_size)
                target_sampled_actions_expected_q_np = target_sampled_actions_expected_q.cpu(
                ).numpy()

            # Define dual function
            def dual(η):
                max_q = np.max(target_sampled_actions_expected_q_np, 0)
                return η * self.ε_dual + np.mean(max_q) \
                    + η * np.mean(np.log(np.mean(np.exp((target_sampled_actions_expected_q_np - max_q) / η), axis=0)))

            bounds = [(1e-6, None)]
            self.η = np.max([self.η, 1e-6])
            res = minimize(dual,
                           np.array([self.η]),
                           method='SLSQP',
                           bounds=bounds)
            self.η = res.x[0]

            qij = th.softmax(target_sampled_actions_expected_q / self.η, dim=0)

            # M-Step
            for _ in range(self.lagrange_iterations):
                action_mean, action_cholesky, _ = self.actor.get_action_dist_params(
                    replay_data.observations)
                π1 = MultivariateNormal(action_mean,
                                        scale_tril=target_action_cholesky)
                π2 = MultivariateNormal(target_action_mean,
                                        scale_tril=action_cholesky)
                loss_p = th.mean(qij * (π1.expand(
                    (self.action_samples,
                     batch_size)).log_prob(sampled_actions) + π2.expand(
                         (self.action_samples,
                          batch_size)).log_prob(sampled_actions)))
                mean_loss_p.append((-loss_p).item())

                kl_μ, kl_Σ = gaussian_kl(μ_target=target_action_mean,
                                         μ=action_mean,
                                         A_target=target_action_cholesky,
                                         A=action_cholesky)
                max_kl_μ.append(kl_μ.item())
                max_kl_Σ.append(kl_Σ.item())

                self.η_kl_μ -= self.α * (self.ε_kl_μ - kl_μ).detach().item()
                self.η_kl_Σ -= self.α * (self.ε_kl_Σ - kl_Σ).detach().item()

                if self.η_kl_μ < 0.0:
                    self.η_kl_μ = 0.0
                if self.η_kl_Σ < 0.0:
                    self.η_kl_Σ = 0.0

                self.actor.optimizer.zero_grad()
                actor_loss = -(loss_p + self.η_kl_μ *
                               (self.ε_kl_μ - kl_μ) + self.η_kl_Σ *
                               (self.ε_kl_Σ - kl_Σ))
                actor_losses.append(actor_loss.item())

                # Optimize actor
                actor_loss.backward()
                clip_grad_norm_(self.actor.parameters(), 0.1)
                self.actor.optimizer.step()

            if gradient_step % self.target_update_interval == 0:
                polyak_update(self.actor.parameters(),
                              self.actor_target.parameters(), self.tau)
                polyak_update(self.critic.parameters(),
                              self.critic_target.parameters(), self.tau)

        self._n_updates += gradient_steps

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude="tensorboard")
        logger.record("train/actor_loss", np.mean(actor_losses))
        logger.record("train/critic_loss", np.mean(critic_losses))
        logger.record("train/actor_policy_loss", np.mean(mean_loss_p))
        logger.record("train/max_kl_mean", np.max(max_kl_μ))
        logger.record("train/mean_kl_mean", np.mean(max_kl_μ))
        logger.record("train/max_kl_std", np.max(max_kl_Σ))
        logger.record("train/mean_kl_std", np.mean(max_kl_Σ))