Example No. 1
    def get_predictions(self, ac, features):
        """Get the current prediction of the dynamics model.

        Returns
        -------
        array
            Returns the output of the dynamics network
        TODO: reimplement chunking
        """
        # TODO: refactor this function, too many shape transformations in ac, confusing
        sh = ac.shape  # = [1, nsteps_per_seg]
        ac = flatten_dims(ac,
                          len(self.ac_space.shape))  # shape = [nsteps_per_seg]
        # Turn actions into a one-hot encoding
        ac = torch.zeros(ac.shape + (self.ac_space.n, )).scatter_(
            1,
            ac.unsqueeze(1).type(torch.int64),
            1)  # shape = [nsteps_per_seg, ac_space.n]

        sh = features.shape  # [1, nsteps_per_seg, feature_dim]
        x = flatten_dims(features, 1)  # [nsteps_per_seg, feature_dim]
        assert x.shape[:-1] == ac.shape[:-1]

        # forward pass of actions and features in dynamics net
        x = self.dynamics_net(x, ac)

        # reshape
        x = unflatten_first_dim(x, sh)  # [1, nsteps_per_seg, feature_dim]

        return x
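Note: the scatter_-based one-hot encoding above can be checked against PyTorch's built-in helper. A minimal sketch, assuming a discrete action space with n = 3 actions (the values below are illustrative only):

import torch

ac = torch.tensor([2, 0, 1])  # example actions, n = 3
n = 3

# scatter_-based one-hot, as in get_predictions
one_hot_scatter = torch.zeros(ac.shape + (n, )).scatter_(
    1, ac.unsqueeze(1).type(torch.int64), 1)

# equivalent built-in (returns int64, so cast to float to compare)
one_hot_builtin = torch.nn.functional.one_hot(ac, num_classes=n).float()

assert torch.equal(one_hot_scatter, one_hot_builtin)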
Example No. 2
    def decoder(self, z):
        """Run latent space activations through the decoder model, apply spherical
        scaling if needed and get the distribution of the reconstructions.

        Parameters
        ----------
        z : Tensor
            Latent activations in the VAE after processing in the encoder.

        Returns
        -------
        tensor
            Reconstruction distribution.

        """
        z_has_timesteps = len(z.shape) == 3
        if z_has_timesteps:
            sh = z.shape
            z = flatten_dims(z, 1)
        # Run the latent vector through the decoder model
        z = self.decoder_model(z)
        # reshape if needed
        if z_has_timesteps:
            z = unflatten_first_dim(z, sh)

        # Calculate the scale parameter
        if self.spherical_obs:
            scale = torch.max(self.scale, torch.tensor(-4.0))
            scale = torch.nn.functional.softplus(scale)
            scale = scale * torch.ones(z.shape)
        else:
            # Split the decoder output channels into reconstruction mean and raw scale
            z, scale = torch.split(z, [4, 4], -3)
            scale = torch.nn.functional.softplus(scale)
        # Return the scaled distribution of the decoder reconstruction.
        return torch.distributions.normal.Normal(z, scale)
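The decoder returns a torch.distributions.Normal rather than a plain tensor, so a reconstruction loss can be computed from its log-probability. A minimal sketch of that usage, assuming a 4-channel image output (all shapes and names below are illustrative, not the repository's API):

import torch

loc = torch.zeros(2, 4, 8, 8)  # decoder mean, e.g. batch of 2, 4 channels
scale = torch.nn.functional.softplus(torch.zeros(2, 4, 8, 8))
recon_dist = torch.distributions.normal.Normal(loc, scale)

target = torch.rand(2, 4, 8, 8)  # observations to reconstruct
# negative log-likelihood: sum over channels and pixels, average over the batch
recon_loss = -recon_dist.log_prob(target).sum(dim=[1, 2, 3]).mean()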
Example No. 3
    def get_features(self, obs):
        """Get features from the feature network.

        Parameters
        ----------
        obs : array
            Observation for which to get features.

        Returns
        -------
        array
            Features of the observations.

        """
        # TODO: refactor - too many shape transformations in obs and act, confusing
        has_timesteps = len(obs.shape) == 5
        if has_timesteps:
            sh = obs.shape  # shape=[1, nsteps, H, W, C]
            obs = flatten_dims(obs, len(
                self.ob_space.shape))  # shape=[nsteps, H, W, C]
        # Normalize observations
        obs = (obs - self.ob_mean) / self.ob_std
        # Permute observations to channels-first, shape=[nsteps, C, H, W]
        obs = obs.permute([i
                           for i in range(len(obs.shape) - 3)] + [-1, -3, -2])
        # Get features from the features_model
        act = self.features_model(obs)

        if has_timesteps:
            act = unflatten_first_dim(act, sh)

        return act
Example No. 4
    def get_features(self, obs):
        """Get features from the feature network.

        Parameters
        ----------
        obs : array
            Observation for which to get features.

        Returns
        -------
        array
            Features of the observations.

        """
        has_timesteps = len(obs.shape) == 5
        if has_timesteps:
            sh = obs.shape  # shape=[1, nsteps, H, W, C]
            obs = flatten_dims(obs, len(
                self.ob_space.shape))  # shape=[nsteps, H, W, C]
        # Normalize observations
        obs = (obs - self.ob_mean) / self.ob_std
        # Permute observations to channels-first, shape=[nsteps, C, H, W]
        obs = np.transpose(obs,
                           [i
                            for i in range(len(obs.shape) - 3)] + [-1, -3, -2])
        # Get features from the features_model
        act = self.features_model(torch.tensor(obs).to(self.device))

        if has_timesteps:
            act = unflatten_first_dim(act, sh)

        return act
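Examples 3 and 4 differ only in how the channels-last observations are moved to channels-first before the feature network (torch permute vs. np.transpose). A small sketch of that axis manipulation with illustrative shapes:

import numpy as np
import torch

obs = np.zeros((16, 84, 84, 4), dtype=np.float32)  # [nsteps, H, W, C]

# numpy version (Example 4): keep leading axes, move C in front of H and W
obs_np = np.transpose(obs, [0, 3, 1, 2])  # shape (16, 4, 84, 84)

# torch version (Example 3)
obs_torch = torch.tensor(obs).permute(0, 3, 1, 2)  # shape [16, 4, 84, 84]

assert obs_np.shape == tuple(obs_torch.shape)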
Example No. 5
    def get_loss_partial(self):
        """Get the loss of the dynamics model with dropout. No use_disagreement is
        calculated here because the dynamics models are trained using the prediction
        error. The disagreement is only used as a reward signal for the policy.
        Dropout is added to the loss to enforce some variance between models while still
        using all of the data.

        Returns
        -------
        array
            Mean squared difference between the output and the next features.

        """
        ac = self.ac
        sh = ac.shape  # = [1, nsteps_per_seg]
        ac = flatten_dims(ac,
                          len(self.ac_space.shape))  # shape = [nsteps_per_seg]
        # Turn actions into a one-hot encoding
        ac = torch.zeros(ac.shape + (self.ac_space.n, )).scatter_(
            1,
            torch.tensor(ac).unsqueeze(1).long(),  # scatter_ needs an int64 index
            1)  # shape = [nsteps_per_seg, ac_space.n]

        features = self.features
        sh = features.shape  # [1, nsteps_per_seg, feature_dim]
        x = flatten_dims(features, 1)  # [nsteps_per_seg, feature_dim]
        assert x.shape[:-1] == ac.shape[:-1]

        # forward pass of actions and features in dynamics net
        x = self.dynamics_net(x.to(self.device), ac.to(self.device))

        # reshape
        x = unflatten_first_dim(x, sh)  # [1, nsteps_per_seg, feature_dim]
        # Take the mean-squared difference between the predicted features (input was
        # the current features and action) and the actual next features
        # (result shape=[1, nsteps_per_seg])
        next_features = self.next_features
        loss = torch.mean((x - next_features)**2, -1)  # mean over the feature dimension
        # Apply dropout here to ensure variability between dynamics models. This is done
        # instead of bootstrapping the samples so that all samples can be used to train
        # every model.
        do = torch.nn.Dropout(p=0.2)
        do_loss = do(loss)
        return do_loss  # per-timestep MSE with dropout applied
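The dropout-on-the-loss trick can be seen in isolation: torch.nn.Dropout zeroes roughly a fraction p of the per-step losses and scales the remaining ones by 1/(1-p), so each dynamics model effectively trains on a different random subsample while the expected loss magnitude stays the same. A small sketch with illustrative values:

import torch

per_step_loss = torch.ones(1, 10)  # pretend per-timestep MSE
do = torch.nn.Dropout(p=0.2)       # a fresh module is in training mode by default
masked = do(per_step_loss)         # ~2 entries zeroed, the rest scaled to 1.25
final_loss = torch.mean(masked)    # what each model backpropagates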
Example No. 6
    def get_loss(self):
        """Calculate the auxiliary loss (backward loss). This is the cross entropy
        between the predicted action probabilities and the actions actually performed.

        Returns
        -------
        tensor
            Losses for each action prediction.

        """
        x = torch.cat([self.features, self.next_features], 2)
        x = flatten_dims(x, 1)
        # Get the action logits for each step. shape=[nsteps_per_seg, act_dim]
        param = self.fc(x)
        # Create probability distribution from logits.
        idfpd = self.policy.ac_pdtype.pdfromflat(param)
        # Get the actions that were actually performed and flatten.
        # shape=[n_steps_per_seg]
        ac = flatten_dims(self.ac, len(self.ac_space.shape))
        # Calculate the cross entropy between the logits of the action predictions and
        # the actual actions. shape=[n_steps_per_seg, 1]
        return idfpd.neglogp(ac)
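For a categorical action distribution, the neglogp above coincides with the cross entropy between the predicted logits and the performed actions. A minimal sketch of that equivalence with hypothetical shapes:

import torch

logits = torch.randn(5, 4)          # [nsteps_per_seg, n_actions]
ac = torch.tensor([0, 3, 1, 1, 2])  # actions actually performed
# per-step negative log probability of the chosen actions
neglogp = torch.nn.functional.cross_entropy(logits, ac, reduction="none")
# its mean is the auxiliary (backward / inverse dynamics) loss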
Example No. 7
    def get_loss(self):
        """Get the current loss of the dynamics model.

        Returns
        -------
        tensor
            If use_disagreement=True, the output of the dynamics network; otherwise
            the mean squared difference between the output and the next features.

        """
        ac = self.ac
        sh = ac.shape  # = [1, nsteps_per_seg]
        ac = flatten_dims(ac,
                          len(self.ac_space.shape))  # shape = [nsteps_per_seg]
        # Turn actions into a one-hot encoding
        ac = torch.zeros(ac.shape + (self.ac_space.n, )).scatter_(
            1,
            torch.tensor(ac).unsqueeze(1).long(),  # scatter_ needs an int64 index
            1)  # shape = [nsteps_per_seg, ac_space.n]

        features = self.features
        sh = features.shape  # [1, nsteps_per_seg, feature_dim]
        x = flatten_dims(features, 1)  # [nsteps_per_seg, feature_dim]
        assert x.shape[:-1] == ac.shape[:-1]

        # forward pass of actions and features in dynamics net
        x = self.dynamics_net(x.to(self.device), ac.to(self.device))

        # reshape
        x = unflatten_first_dim(x, sh)  # [1, nsteps_per_seg, feature_dim]
        if self.use_disagreement:
            # Return output from dynamics network
            # (shape=[1, nsteps_per_seg, next_feature_dim])
            return x
        else:
            # Take the mean-squared difference between the predicted features (input
            # was the current features and action) and the actual next features
            # (result shape=[1, nsteps_per_seg])
            next_features = self.next_features
            return torch.mean((x - next_features)**2, -1)
Example No. 8
    def update(self):
        """Calculate losses and update parameters based on current rollout.

        Returns
        -------
        info
            Dictionary of infos about the current update and training statistics.

        """
        if self.normrew:
            # Normalize the rewards using the running mean and std
            discounted_rewards = np.array([
                self.reward_forward_filter.update(rew)
                for rew in self.rollout.buf_rewards.T
            ])
            # Update the running mean and std of the discounted rewards
            rewards_mean, rewards_std, rewards_count = mpi_moments(
                discounted_rewards.ravel())
            self.reward_stats.update_from_moments(rewards_mean, rewards_std**2,
                                                  rewards_count)
            rews = self.rollout.buf_rewards / np.sqrt(self.reward_stats.var)
        else:
            rews = np.copy(self.rollout.buf_rewards)

        # Calculate advantages using the current rewards and value estimates
        self.calculate_advantages(rews=rews,
                                  use_done=self.use_done,
                                  gamma=self.gamma,
                                  lam=self.lam)

        # Initialize and update the info dict for logging
        info = dict()
        info["ppo/advantage_mean"] = self.buf_advantages.mean()
        info["ppo/advantage_std"] = self.buf_advantages.std()
        info["ppo/return_mean"] = self.buf_returns.mean()
        info["ppo/return_std"] = self.buf_returns.std()
        info["ppo/value_est_mean"] = self.rollout.buf_vpreds.mean()
        info["ppo/value_est_std"] = self.rollout.buf_vpreds.std()
        info["ppo/explained_variance"] = explained_variance(
            self.rollout.buf_vpreds.ravel(), self.buf_returns.ravel())
        info["ppo/reward_mean"] = np.mean(self.rollout.buf_rewards)

        if self.rollout.best_ext_return is not None:
            info["performance/best_ext_return"] = self.rollout.best_ext_return
        # TODO: maybe add extra flag for detailed logging so runs are not slowed down
        if not self.debugging:
            feature_stats, stacked_act_feat = self.get_activation_stats(
                self.rollout.buf_acts_features, "activations_features/")
            hidden_stats, stacked_act_pi = self.get_activation_stats(
                self.rollout.buf_acts_pi, "activations_hidden/")
            info.update(feature_stats)
            info.update(hidden_stats)

            info[
                "activations_features/raw_act_distribution"] = wandb.Histogram(
                    stacked_act_feat)
            info["activations_hidden/raw_act_distribution"] = wandb.Histogram(
                stacked_act_pi)

            info["ppo/action_distribution"] = wandb.Histogram(
                self.rollout.buf_acs.flatten())

            if self.vLogFreq >= 0 and self.n_updates % self.vLogFreq == 0:
                print(str(self.n_updates) + " updates - logging video.")
                # Reshape images such that they have shape [time,channels,width,height]
                sample_video = np.moveaxis(self.rollout.buf_obs[0], 3, 1)
                # Log buffer video from first env
                info["observations"] = wandb.Video(sample_video,
                                                   fps=12,
                                                   format="gif")

        to_report = Counter()

        if self.normadv:  # defaults to True
            # normalize advantages
            m, s = get_mean_and_std(self.buf_advantages)
            self.buf_advantages = (self.buf_advantages - m) / (s + 1e-7)
        # Set update hyperparameters
        envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
        envsperbatch = max(1, envsperbatch)
        envinds = np.arange(self.nenvs * self.nsegs_per_env)

        # Update the networks & get losses for nepochs * nminibatches
        for _ in range(self.nepochs):
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs * self.nsegs_per_env,
                               envsperbatch):
                end = start + envsperbatch
                minibatch_envinds = envinds[start:end]  # minibatch environment indices
                # Get rollout experiences for current minibatch
                acs = self.rollout.buf_acs[minibatch_envinds]
                rews = self.rollout.buf_rewards[minibatch_envinds]
                neglogprobs = self.rollout.buf_neglogprobs[
                    minibatch_envinds]  # neg log probs of actions under the old policy
                obs = self.rollout.buf_obs[minibatch_envinds]
                returns = self.buf_returns[minibatch_envinds]
                advantages = self.buf_advantages[minibatch_envinds]
                last_obs = self.rollout.buf_obs_last[minibatch_envinds]

                # Update features of the policy network to minibatch obs and acs
                self.policy.update_features(obs, acs)

                # Update features of the auxiliary network to minibatch obs and acs
                # Using the first element of the dynamics list is sufficient because all
                # dynamics models share the same auxiliary task model and features
                # TODO: should the feature model be independent of dynamics?
                self.dynamics_list[0].auxiliary_task.update_features(
                    obs, last_obs)
                # Get the loss and variance of the feature model
                aux_loss = torch.mean(
                    self.dynamics_list[0].auxiliary_task.get_loss())
                # Take variance over steps -> [feature_dim] vars -> average
                # This is the average variance in a feature over time
                feature_var = torch.mean(
                    torch.var(self.dynamics_list[0].auxiliary_task.features,
                              [0, 1]))
                feature_var_2 = torch.mean(
                    torch.var(self.dynamics_list[0].auxiliary_task.features,
                              [2]))

                # disagreement = []
                dyn_prediction_loss = []
                # Loop through dynamics models
                for dynamic in self.dynamics_list:
                    # Get the features of the observations in the dynamics model (just
                    # gets features from the auxiliary model)
                    dynamic.update_features()
                    # Put features into the dynamics model and get the loss
                    # (if use_disagreement, get_loss just returns the features; therefore
                    # the partial loss is used here for optimizing and logging)
                    # disagreement.append(torch.mean(np.var(dynamic.get_loss(), axis=0)))

                    # Put features into dynamics model and get partial loss (dropout)
                    dyn_prediction_loss.append(
                        torch.mean(dynamic.get_loss_partial()))

                # Reshape actions and put in tensor
                acs = torch.tensor(flatten_dims(acs, len(
                    self.ac_space.shape))).to(self.device)
                # Get the negative log probs of the actions under the policy
                neglogprobs_new = self.policy.pd.neglogp(acs)
                # Get the entropy of the current policy
                entropy = torch.mean(self.policy.pd.entropy())
                # Get the value estimate of the policies value head
                vpred = self.policy.vpred
                # Calculate the mean-squared difference between value estimate and return
                vf_loss = 0.5 * torch.mean(
                    (vpred.squeeze() - torch.tensor(returns).to(self.device))**
                    2)
                # Put old neglogprobs from buffer into tensor
                neglogprobs_old = torch.tensor(flatten_dims(neglogprobs,
                                                            0)).to(self.device)
                # Calculate the probability ratio exp(neglogprobs_old - neglogprobs_new)
                # neglogprobs_old: negative log probability of the action (old policy)
                # neglogprobs_new: negative log probability of the action (new policy)
                ratio = torch.exp(neglogprobs_old - neglogprobs_new.squeeze())
                # Put advantages and negative advantages into tensors
                advantages = flatten_dims(advantages, 0)
                neg_advantages = torch.tensor(-advantages).to(self.device)
                # Calculate policy gradient loss. Once multiplied with original ratio
                # between old and new policy probs (1 if identical) and once with
                # clipped ratio.
                policy_gradient_losses1 = neg_advantages * ratio
                policy_gradient_losses2 = neg_advantages * torch.clamp(
                    ratio, min=1.0 - self.cliprange, max=1.0 + self.cliprange)
                # Get the bigger of the two losses
                policy_gradient_loss_surr = torch.max(policy_gradient_losses1,
                                                      policy_gradient_losses2)
                # Get the average policy gradient loss
                policy_gradient_loss = torch.mean(policy_gradient_loss_surr)

                # Get an approximation of the KL divergence between old and new policy
                # probabilities (mean squared difference of the neglogprobs)
                approx_kl_divergence = 0.5 * torch.mean(
                    (neglogprobs_old - neglogprobs_new.squeeze())**2)
                # Get the fraction of times that the policy gradient loss was clipped
                clipfrac = torch.mean(
                    (torch.abs(policy_gradient_losses2 -
                               policy_gradient_loss_surr) > 1e-6).float())

                # Multiply the policy entropy with the entropy coefficient
                entropy_loss = (-self.entropy_coef) * entropy

                # Calculate the total loss from the policy gradient loss, the entropy
                # loss (*entropy_coef), the value function loss (*0.5) and the feature loss
                total_loss = policy_gradient_loss + entropy_loss + vf_loss + aux_loss
                for i in range(len(dyn_prediction_loss)):
                    # add the loss of each of the dynamics networks to the total loss
                    total_loss = total_loss + dyn_prediction_loss[i]
                # propagate the loss back through the networks
                total_loss.backward()
                self.optimizer.step()
                # set the gradients back to zero
                self.optimizer.zero_grad()

                # Log statistics (divide by nminibatches * nepochs because the losses
                # are accumulated over both loops)
                norm = self.nminibatches * self.nepochs
                to_report["loss/total_loss"] += total_loss.cpu().data.numpy() / norm
                to_report["loss/policy_gradient_loss"] += (
                    policy_gradient_loss.cpu().data.numpy() / norm)
                to_report["loss/value_loss"] += vf_loss.cpu().data.numpy() / norm
                to_report["loss/entropy_loss"] += entropy_loss.cpu().data.numpy() / norm
                to_report["ppo/approx_kl_divergence"] += (
                    approx_kl_divergence.cpu().data.numpy() / norm)
                to_report["ppo/clipfraction"] += clipfrac.cpu().data.numpy() / norm
                to_report["phi/feature_var_ax01"] += feature_var.cpu().data.numpy() / norm
                to_report["phi/feature_var_ax2"] += feature_var_2.cpu().data.numpy() / norm
                to_report["loss/auxiliary_task"] += aux_loss.cpu().data.numpy() / norm
                to_report["loss/dynamic_loss"] += np.sum(
                    [e.cpu().data.numpy() for e in dyn_prediction_loss]) / norm

        info.update(to_report)
        self.n_updates += 1
        info["performance/buffer_external_rewards"] = np.sum(
            self.rollout.buf_ext_rewards)
        # This is logged mainly for the robot_arm environment, where the touch sensor
        # magnitude can vary a lot.
        info["performance/buffer_external_rewards_mean"] = np.mean(
            self.rollout.buf_ext_rewards)
        info["performance/buffer_external_rewards_present"] = np.mean(
            self.rollout.buf_ext_rewards > 0)
        info["run/n_updates"] = self.n_updates
        info.update({
            dn: (np.mean(dvs) if len(dvs) > 0 else 0)
            for (dn, dvs) in self.rollout.statlists.items()
        })
        info.update(self.rollout.stats)
        if "states_visited" in info:
            info.pop("states_visited")
        tnow = time.time()
        info["run/updates_per_second"] = 1.0 / (tnow - self.t_last_update)
        self.total_secs = tnow - self.t_start + self.time_trained_so_far
        info["run/total_secs"] = self.total_secs
        info["run/tps"] = self.rollout.nsteps * self.nenvs / (
            tnow - self.t_last_update)
        self.t_last_update = tnow

        return info
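The reward normalization at the top of update() relies on a forward discounted filter whose running standard deviation scales the rewards. The sketch below shows how such a filter is commonly implemented; it is an illustration under that assumption, not necessarily the exact class behind self.reward_forward_filter:

import numpy as np

class RewardForwardFilter:
    """Running discounted sum of rewards, one value per environment (sketch)."""

    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None

    def update(self, rews):
        # rews: rewards of all envs at one timestep, shape [nenvs]
        if self.rewems is None:
            self.rewems = rews.astype(np.float64)
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems.copy()

# usage mirroring update(): iterate over timesteps, then scale by the std
filt = RewardForwardFilter(gamma=0.99)
buf_rewards = np.random.rand(8, 128)  # [nenvs, nsteps]
discounted = np.array([filt.update(r) for r in buf_rewards.T])
rews = buf_rewards / (discounted.ravel().std() + 1e-8)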
Example No. 9
    def ppo_loss(self, obs, acs, neglogprobs, advantages, returns, *args):
        """Calculate the clipped PPO surrogate loss and related statistics."""

        # Reshape actions and put in tensor
        acs = flatten_dims(acs, len(self.ac_space.shape))
        # Update the logits of the newest policy corresponding to the current obs & acs
        self.policy.update_features(obs, acs)
        # Get the negative log probs of the actions under the policy
        neglogprobs_new = self.policy.pd.neglogp(acs.type(torch.LongTensor))
        # Get the entropy of the current policy
        entropy = torch.mean(self.policy.pd.entropy())
        # Get the value estimate of the policies value head
        vpred = self.policy.vpred
        # Calculate the mean-squared difference between value estimate and return
        vf_loss = 0.5 * torch.mean((vpred.squeeze() - returns.detach()) ** 2)
        # Put old neglogprobs from buffer into tensor
        neglogprobs_old = flatten_dims(neglogprobs, 0)
        # Calculate the probability ratio exp(neglogprobs_old - neglogprobs_new)
        # neglogprobs_old: negative log probability of the action (old policy)
        # neglogprobs_new: negative log probability of the action (new policy)
        ratio = torch.exp(neglogprobs_old.detach() - neglogprobs_new.squeeze())
        # Put advantages and negative advantages into tensors
        advantages = flatten_dims(advantages.detach(), 0)
        neg_advantages = -advantages
        # Calculate policy gradient loss. Once multiplied with original ratio
        # between old and new policy probs (1 if identical) and once with
        # clipped ratio.
        policy_gradient_losses1 = neg_advantages * ratio
        policy_gradient_losses2 = neg_advantages * torch.clamp(
            ratio, min=1.0 - self.cliprange, max=1.0 + self.cliprange
        )
        # Get the bigger of the two losses
        policy_gradient_loss_surr = torch.max(
            policy_gradient_losses1, policy_gradient_losses2
        )
        # Get the average policy gradient loss
        policy_gradient_loss = torch.mean(policy_gradient_loss_surr)

        # Get an approximation of the KL divergence between old and new policy
        # probabilities (mean squared difference of the neglogprobs)
        approx_kl_divergence = 0.5 * torch.mean(
            (neglogprobs_old - neglogprobs_new.squeeze()) ** 2
        )
        # Get the fraction of times that the policy gradient loss was clipped
        clipfrac = torch.mean(
            (torch.abs(policy_gradient_losses2 - policy_gradient_loss_surr) > 1e-6)
            .float()
        )

        # Multiply the policy entropy with the entropy coefficient
        entropy_loss = (-self.entropy_coef) * entropy

        # Calculate the total PPO loss from the policy gradient loss, the entropy
        # loss (*entropy_coef) and the value function loss (*0.5)
        # TODO: problem in pg loss and vf loss: Trying to backpropagate a second time
        ppo_loss = policy_gradient_loss + entropy_loss + vf_loss

        return ppo_loss, {
            "ppo/approx_kl_divergence": approx_kl_divergence,
            "ppo/clipfraction": clipfrac,
            "loss/policy_gradient_loss": policy_gradient_loss,
            "loss/value_loss": vf_loss,
            "loss/entropy_loss": entropy_loss,
        }
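The clipping logic in ppo_loss can be checked in isolation: once the ratio drifts above 1 + cliprange with a positive advantage (or below 1 - cliprange with a negative one), the clipped term wins the max and its gradient with respect to the ratio vanishes. A small self-contained sketch with toy values:

import torch

cliprange = 0.1
ratio = torch.tensor([0.8, 1.0, 1.3], requires_grad=True)  # new/old action probs
advantages = torch.tensor([1.0, 1.0, 1.0])                  # all positive here

neg_adv = -advantages
surr1 = neg_adv * ratio
surr2 = neg_adv * torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange)
loss = torch.max(surr1, surr2).mean()
loss.backward()

# The third entry (ratio 1.3 > 1 + cliprange with a positive advantage) is
# clipped, so it contributes no gradient; the first two still do.
print(ratio.grad)  # tensor([-0.3333, -0.3333, 0.0000])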