Beispiel #1
    def get_predictions(self, ac, features):
        """Get the current prediction of the dynamics model.

        Parameters
        ----------
        ac : array or Tensor
            Integer action indices. Assumed shape [1, nsteps_per_seg] (taken
            from the original inline comments -- TODO confirm).
        features : Tensor
            Features fed to the dynamics network. Assumed shape
            [1, nsteps_per_seg, feature_dim] -- TODO confirm.

        Returns
        -------
        Tensor
            Output of the dynamics network after ``unflatten_first_dim`` has
            been applied with the original shape of ``features``.
        """
        # TODO: reimplement chunking
        # TODO: refactor this function, too many shape transformations in ac, confusing
        ac = flatten_dims(ac, len(self.ac_space.shape))  # shape = [nsteps_per_seg]
        # Turn actions into one hot encoding. scatter_ needs an int64 index of
        # the same rank as the target, hence the cast and the unsqueeze.
        index = torch.as_tensor(ac, dtype=torch.int64).unsqueeze(1)
        ac = torch.zeros(
            tuple(ac.shape) + (self.ac_space.n,), device=index.device
        ).scatter_(1, index, 1.0)  # shape = [nsteps_per_seg, ac_space.n]

        sh = features.shape  # [1, nsteps_per_seg, feature_dim]
        x = flatten_dims(features, 1)  # [nsteps_per_seg, feature_dim]
        assert x.shape[:-1] == ac.shape[:-1]

        # forward pass of actions and features in dynamics net
        x = self.dynamics_net(x, ac)

        # reshape
        x = unflatten_first_dim(x, sh)  # [1, nsteps_per_seg, feature_dim]

        return x
    def decoder(self, z):
        """Run latent space activations through the decoder model, apply spherical
        scaling if needed and get the distribution of the reconstructions.

        Parameters
        ----------
        z : Tensor
            Latent activations in the VAE after processing in the encoder. A
            3-dimensional input is treated as carrying a timestep dimension and
            is flattened before, and unflattened after, the decoder model.

        Returns
        -------
        torch.distributions.normal.Normal
            Reconstruction distribution.
        """
        z_has_timesteps = len(z.shape) == 3
        if z_has_timesteps:
            sh = z.shape
            z = flatten_dims(z, 1)
        # Run the latent vector through the decoder model
        z = self.decoder_model(z)
        # reshape if needed
        if z_has_timesteps:
            z = unflatten_first_dim(z, sh)

        # Calculate the scale parameter
        if self.spherical_obs:
            # One shared scale (clamped from below at -4 before the softplus),
            # broadcast to the shape of the decoder output.
            scale = torch.max(self.scale, torch.tensor(-4.0))
            scale = torch.nn.functional.softplus(scale)
            scale = scale * torch.ones(z.shape)
        else:
            # The decoder output carries mean and scale stacked along dim -3
            # (4 entries each); softplus keeps the scale strictly positive.
            z, scale = torch.split(z, [4, 4], -3)
            scale = torch.nn.functional.softplus(scale)
        # Return the scaled distribution of the decoder reconstruction.
        return torch.distributions.normal.Normal(z, scale)
    def get_features(self, obs):
        """Get features from the feature network.

        Parameters
        ----------
        obs : Tensor
            Observation for which to get features. A 5-dimensional input is
            treated as carrying a timestep dimension; the channel axis is
            expected last (it is moved in front of the two spatial axes).

        Returns
        -------
        Tensor
            Features of the observations.
        """
        # TODO: refactor - too many shape transformations in obs and act, confusing
        has_timesteps = len(obs.shape) == 5
        if has_timesteps:
            sh = obs.shape  # shape=[1, nsteps, H, W, C]
            obs = flatten_dims(obs, len(self.ob_space.shape))  # shape=[nsteps, H, W, C]
        # Normalize observations
        obs = (obs - self.ob_mean) / self.ob_std
        # Reshape observations, shape=[nsteps, C, H, W]: keep the leading axes
        # and move the channel axis in front of height and width.
        leading_axes = list(range(len(obs.shape) - 3))
        obs = obs.permute(leading_axes + [-1, -3, -2])
        # Get features from the features_model
        act = self.features_model(obs)

        if has_timesteps:
            act = unflatten_first_dim(act, sh)

        return act
Beispiel #4
    def get_features(self, obs):
        """Get features from the feature network.

        Parameters
        ----------
        obs : array
            Observation for which to get features. A 5-dimensional input is
            treated as carrying a timestep dimension; the channel axis is
            expected last (it is moved in front of the two spatial axes).

        Returns
        -------
        Tensor
            Features of the observations.
        """
        has_timesteps = len(obs.shape) == 5
        if has_timesteps:
            sh = obs.shape  # shape=[1, nsteps, H, W, C]
            obs = flatten_dims(obs, len(self.ob_space.shape))  # shape=[nsteps, H, W, C]
        # Normalize observations
        obs = (obs - self.ob_mean) / self.ob_std
        # Reshape observations, shape=[nsteps, C, H, W]: keep the leading axes
        # and move the channel axis in front of height and width.
        leading_axes = list(range(len(obs.shape) - 3))
        obs = np.transpose(obs, leading_axes + [-1, -3, -2])
        # Get features from the features_model
        act = self.features_model(torch.tensor(obs).to(self.device))

        if has_timesteps:
            act = unflatten_first_dim(act, sh)

        return act
Beispiel #5
    def get_loss_partial(self):
        """Get the loss of the dynamics model with dropout. No use_disagreement is
        calculated here because the dynamics models are trained using the prediction
        error. The disagreement is only used as a reward signal for the policy.
        Dropout is added to the loss to enforce some variance between models while still
        using all of the data.

        Returns
        -------
        Tensor
            Mean squared difference between the output and the next features
            (averaged over the last dimension), with dropout applied.
        """
        # NOTE(review): the right-hand side of this assignment was missing in
        # this copy of the file; self.ac is assumed by analogy with
        # self.features / self.next_features -- confirm against the class.
        ac = self.ac
        ac = flatten_dims(ac, len(self.ac_space.shape))  # shape = [nsteps_per_seg]
        # Turn actions into one hot encoding. scatter_ needs an int64 index of
        # the same rank as the target, hence the cast and the unsqueeze.
        index = torch.as_tensor(ac, dtype=torch.int64).unsqueeze(1)
        ac = torch.zeros(
            tuple(ac.shape) + (self.ac_space.n,), device=index.device
        ).scatter_(1, index, 1.0)  # shape = [nsteps_per_seg, ac_space.n]

        features = self.features
        sh = features.shape  # [1, nsteps_per_seg, feature_dim]
        x = flatten_dims(features, 1)  # [nsteps_per_seg, feature_dim]
        assert x.shape[:-1] == ac.shape[:-1]

        # forward pass of actions and features in dynamics net
        x = self.dynamics_net(x, ac)

        # reshape
        x = unflatten_first_dim(x, sh)  # [1, nsteps_per_seg, feature_dim]
        # Take the mean-squared diff between out features (input was current
        # features and action) and next features (shape=[1, nsteps_per_seg])
        next_features = self.next_features
        loss = torch.mean((x - next_features) ** 2, -1)  # mean over the last dim
        # Apply dropout here to ensure variability between dynamics models. This is done
        # instead of bootstrapping the samples so that all samples can be used to train
        # every model.
        do = torch.nn.Dropout(p=0.2)
        do_loss = do(loss)
        return do_loss
    def get_loss(self):
        """Calculate the auxiliary loss (backward loss). This is the cross entropy
        between the predicted action probabilities and the actions actually performed.

        Returns
        -------
        Tensor
            Losses for each action prediction.
        """
        # NOTE(review): the call name was missing in this copy of the file; a
        # concatenation of current and next features along dim 2 is assumed
        # from the remaining arguments -- confirm.
        x = torch.cat([self.features, self.next_features], 2)
        x = flatten_dims(x, 1)
        # Get action probabilities for each action. shape=[nsteps_per_seg, act_dim]
        param = self.fc(x)
        # Create probability distribution from logits.
        idfpd = self.policy.ac_pdtype.pdfromflat(param)
        # Get the actions that were actually performed and flatten.
        # shape=[n_steps_per_seg]
        # NOTE(review): the first argument was missing in this copy of the
        # file; self.ac is assumed -- confirm against the class.
        ac = flatten_dims(self.ac, len(self.ac_space.shape))
        # Calculate the cross entropy between the logits of the action predictions and
        # the actual actions. shape=[n_steps_per_seg, 1]
        return idfpd.neglogp(ac)
Beispiel #7
    def get_loss(self):
        """Get the current loss of the dynamics model.

        Returns
        -------
        Tensor
            If use_disagreement=True returns the output of the loss network, otherwise
            the mean squared difference between the output and the next features.
        """
        # NOTE(review): the right-hand side of this assignment was missing in
        # this copy of the file; self.ac is assumed by analogy with
        # self.features / self.next_features -- confirm against the class.
        ac = self.ac
        ac = flatten_dims(ac, len(self.ac_space.shape))  # shape = [nsteps_per_seg]
        # Turn actions into one hot encoding. scatter_ needs an int64 index of
        # the same rank as the target, hence the cast and the unsqueeze.
        index = torch.as_tensor(ac, dtype=torch.int64).unsqueeze(1)
        ac = torch.zeros(
            tuple(ac.shape) + (self.ac_space.n,), device=index.device
        ).scatter_(1, index, 1.0)  # shape = [nsteps_per_seg, ac_space.n]

        features = self.features
        sh = features.shape  # [1, nsteps_per_seg, feature_dim]
        x = flatten_dims(features, 1)  # [nsteps_per_seg, feature_dim]
        assert x.shape[:-1] == ac.shape[:-1]

        # forward pass of actions and features in dynamics net
        x = self.dynamics_net(x, ac)

        # reshape
        x = unflatten_first_dim(x, sh)  # [1, nsteps_per_seg, feature_dim]
        if self.use_disagreement:
            # Return output from dynamics network
            # (shape=[1, nsteps_per_seg, next_feature_dim])
            return x
        # Take the mean-squared diff between out features (input was current
        # features and action) and next features (shape=[1, nsteps_per_seg])
        next_features = self.next_features
        return torch.mean((x - next_features) ** 2, -1)
    def update(self):
        """Calculate losses and update parameters based on current rollout.

        Returns
        -------
        dict
            Dictionary of infos about the current update and training statistics.
        """
        # NOTE(review): this copy of the method is incomplete. Several
        # statements are visibly truncated (calls and list literals that are
        # never closed, dictionary subscripts without their opening part,
        # comment-only steps with no statement after them), so the method does
        # not parse as it stands. Restore the missing lines from the upstream
        # source before relying on it; the comments below only describe what
        # is still visible.

        if self.normrew:
            # Normalize the rewards using the running mean and std
            discounted_rewards = np.array([
                for rew in self.rollout.buf_rewards.T
            # rewards_mean, rewards_std, rewards_count
            rewards_mean, rewards_std, rewards_count = mpi_moments(
            # reward forward filter running mean std
            self.reward_stats.update_from_moments(rewards_mean, rewards_std**2,
            rews = self.rollout.buf_rewards / np.sqrt(self.reward_stats.var)
            # NOTE(review): the next assignment overwrites the normalized
            # rewards; presumably an `else:` header is missing here -- confirm.
            rews = np.copy(self.rollout.buf_rewards)

        # Calculate advantages using the current rewards and value estimates
        # NOTE(review): no statement follows this step in this copy.

        # Initialize and update the info dict for logging
        info = dict()
        info["ppo/advantage_mean"] = self.buf_advantages.mean()
        info["ppo/advantage_std"] = self.buf_advantages.std()
        info["ppo/return_mean"] = self.buf_returns.mean()
        info["ppo/return_std"] = self.buf_returns.std()
        info["ppo/value_est_mean"] = self.rollout.buf_vpreds.mean()
        info["ppo/value_est_std"] = self.rollout.buf_vpreds.std()
        info["ppo/explained_variance"] = explained_variance(
            self.rollout.buf_vpreds.ravel(), self.buf_returns.ravel())
        info["ppo/reward_mean"] = np.mean(self.rollout.buf_rewards)

        if self.rollout.best_ext_return is not None:
            info["performance/best_ext_return"] = self.rollout.best_ext_return
        # TODO: maybe add extra flag for detailed logging so runs are not slowed down
        if not self.debugging:
            feature_stats, stacked_act_feat = self.get_activation_stats(
                self.rollout.buf_acts_features, "activations_features/")
            hidden_stats, stacked_act_pi = self.get_activation_stats(
                self.rollout.buf_acts_pi, "activations_hidden/")

                "activations_features/raw_act_distribution"] = wandb.Histogram(
            info["activations_hidden/raw_act_distribution"] = wandb.Histogram(

            info["ppo/action_distribution"] = wandb.Histogram(

            if self.vLogFreq >= 0 and self.n_updates % self.vLogFreq == 0:
                print(str(self.n_updates) + " updates - logging video.")
                # Reshape images such that they have shape [time,channels,width,height]
                sample_video = np.moveaxis(self.rollout.buf_obs[0], 3, 1)
                # Log buffer video from first env
                info["observations"] = wandb.Video(sample_video,

        # Accumulator for the per-minibatch statistics logged further below
        to_report = Counter()

        if self.normadv:  # defaults to True
            # normalize advantages
            m, s = get_mean_and_std(self.buf_advantages)
            self.buf_advantages = (self.buf_advantages - m) / (s + 1e-7)
        # Set update hyperparameters
        envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
        envsperbatch = max(1, envsperbatch)
        envinds = np.arange(self.nenvs * self.nsegs_per_env)

        # Update the networks & get losses for nepochs * nminibatches
        for _ in range(self.nepochs):
            for start in range(0, self.nenvs * self.nsegs_per_env,
                end = start + envsperbatch
                minibatch_envinds = envinds[
                    start:end]  # minibatch environment indexes
                # Get rollout experiences for current minibatch
                acs = self.rollout.buf_acs[minibatch_envinds]
                rews = self.rollout.buf_rewards[minibatch_envinds]
                neglogprobs = self.rollout.buf_neglogprobs[
                    minibatch_envinds]  # negative log probabilities (action probabilities from pi)
                obs = self.rollout.buf_obs[minibatch_envinds]
                returns = self.buf_returns[minibatch_envinds]
                advantages = self.buf_advantages[minibatch_envinds]
                last_obs = self.rollout.buf_obs_last[minibatch_envinds]

                # Update features of the policy network to minibatch obs and acs
                self.policy.update_features(obs, acs)

                # Update features of the auxiliary network to minibatch obs and acs
                # Using first element in dynamics list is sufficient bc all dynamics
                # models have the same auxiliary task model and features
                # TODO: should the feature model be independent of dynamics?
                    obs, last_obs)
                # Get the loss and variance of the feature model
                aux_loss = torch.mean(
                # Take variance over steps -> [feature_dim] vars -> average
                # This is the average variance in a feature over time
                feature_var = torch.mean(
                              [0, 1]))
                feature_var_2 = torch.mean(

                # disagreement = []
                dyn_prediction_loss = []
                # Loop through dynamics models
                for dynamic in self.dynamics_list:
                    # Get the features of the observations in the dynamics model (just
                    # gets features from the auxiliary model)
                    # Put features into dynamics model and get loss
                    # (if use_disagreement just returns features, therefore here the
                    # partial loss is used for optimizing and logging)
                    # disagreement.append(torch.mean(np.var(dynamic.get_loss(),axis=0)))

                    # Put features into dynamics model and get partial loss (dropout)

                # Reshape actions and put in tensor
                acs = torch.tensor(flatten_dims(acs, len(
                # Get the negative log probs of the actions under the policy
                neglogprobs_new = self.policy.pd.neglogp(acs)
                # Get the entropy of the current policy
                entropy = torch.mean(self.policy.pd.entropy())
                # Get the value estimate of the policies value head
                vpred = self.policy.vpred
                # Calculate the msq difference between value estimate and return
                vf_loss = 0.5 * torch.mean(
                    (vpred.squeeze() - torch.tensor(returns).to(self.device))**
                # Put old neglogprobs from buffer into tensor
                neglogprobs_old = torch.tensor(flatten_dims(neglogprobs,
                # Calculate exp difference between old nlp and neglogprobs_new
                # neglogprobs: negative log probability of the action (old)
                # neglogprobs_new: negative log probability of the action (new)
                ratio = torch.exp(neglogprobs_old - neglogprobs_new.squeeze())
                # Put advantages and negative advantages into tensors
                advantages = flatten_dims(advantages, 0)
                neg_advantages = torch.tensor(-advantages).to(self.device)
                # Calculate policy gradient loss. Once multiplied with original ratio
                # between old and new policy probs (1 if identical) and once with
                # clipped ratio.
                policy_gradient_losses1 = neg_advantages * ratio
                policy_gradient_losses2 = neg_advantages * torch.clamp(
                    ratio, min=1.0 - self.cliprange, max=1.0 + self.cliprange)
                # Get the bigger of the two losses
                policy_gradient_loss_surr = torch.max(policy_gradient_losses1,
                # Get the average policy gradient loss
                policy_gradient_loss = torch.mean(policy_gradient_loss_surr)

                # Get an approximation of the kl-difference between old and new policy
                # probabilities (mean squared difference)
                approx_kl_divergence = 0.5 * torch.mean(
                    (neglogprobs_old - neglogprobs_new.squeeze())**2)
                # Get the fraction of times that the policy gradient loss was clipped
                clipfrac = torch.mean(
                    (torch.abs(policy_gradient_losses2 -
                               policy_gradient_loss_surr) > 1e-6).float())

                # Multiply the policy entropy with the entropy coefficient
                entropy_loss = (-self.entropy_coef) * entropy

                # Calculate the total loss out of the policy gradient loss, the entropy
                # loss (*entropy_coef), the value function loss (*0.5) and feature loss
                total_loss = policy_gradient_loss + entropy_loss + vf_loss + aux_loss
                for i in range(len(dyn_prediction_loss)):
                    # add the loss of each of the dynamics networks to the total loss
                    total_loss = total_loss + dyn_prediction_loss[i]
                # propagate the loss back through the networks
                # set the gradients back to zero
                # NOTE(review): no backward/optimizer statements are visible
                # for the two steps above in this copy.

                # Log statistics (divide by nminibatches * nepochs because we add the
                # loss in these two loops.)
                to_report["loss/total_loss"] += total_loss.cpu().data.numpy(
                ) / (self.nminibatches * self.nepochs)
                    "loss/policy_gradient_loss"] += policy_gradient_loss.cpu(
                    ).data.numpy() / (self.nminibatches * self.nepochs)
                to_report["loss/value_loss"] += vf_loss.cpu().data.numpy() / (
                    self.nminibatches * self.nepochs)
                to_report["loss/entropy_loss"] += entropy_loss.cpu(
                ).data.numpy() / (self.nminibatches * self.nepochs)
                    "ppo/approx_kl_divergence"] += approx_kl_divergence.cpu(
                    ).data.numpy() / (self.nminibatches * self.nepochs)
                to_report["ppo/clipfraction"] += clipfrac.cpu().data.numpy(
                ) / (self.nminibatches * self.nepochs)
                to_report["phi/feature_var_ax01"] += feature_var.cpu(
                ).data.numpy() / (self.nminibatches * self.nepochs)
                to_report["phi/feature_var_ax2"] += feature_var_2.cpu(
                ).data.numpy() / (self.nminibatches * self.nepochs)
                to_report["loss/auxiliary_task"] += aux_loss.cpu().data.numpy(
                ) / (self.nminibatches * self.nepochs)
                to_report["loss/dynamic_loss"] += np.sum([
                    e.cpu().data.numpy() for e in dyn_prediction_loss
                ]) / (self.nminibatches * self.nepochs)

        # Count this update and record run-level statistics
        self.n_updates += 1
        info["performance/buffer_external_rewards"] = np.sum(
        # This is especially for the robot_arm environment because the touch sensor
        # magnitude can vary a lot.
        info["performance/buffer_external_rewards_mean"] = np.mean(
        info["performance/buffer_external_rewards_present"] = np.mean(
            self.rollout.buf_ext_rewards > 0)
        info["run/n_updates"] = self.n_updates
            dn: (np.mean(dvs) if len(dvs) > 0 else 0)
            for (dn, dvs) in self.rollout.statlists.items()
        if "states_visited" in info:
        # Timing statistics, measured since the previous update
        tnow = time.time()
        info["run/updates_per_second"] = 1.0 / (tnow - self.t_last_update)
        self.total_secs = tnow - self.t_start + self.time_trained_so_far
        info["run/total_secs"] = self.total_secs
        info["run/tps"] = self.rollout.nsteps * self.nenvs / (
            tnow - self.t_last_update)
        self.t_last_update = tnow

        return info
    def ppo_loss(self, obs, acs, neglogprobs, advantages, returns, *args):

        # Reshape actions and put in tensor
        acs = flatten_dims(acs, len(self.ac_space.shape))
        # Update the logits of the newest policy corresponding to the current obs & acs
        self.policy.update_features(obs, acs)
        # Get the negative log probs of the actions under the policy
        neglogprobs_new = self.policy.pd.neglogp(acs.type(torch.LongTensor))
        # Get the entropy of the current policy
        entropy = torch.mean(self.policy.pd.entropy())
        # Get the value estimate of the policies value head
        vpred = self.policy.vpred
        # Calculate the msq difference between value estimate and return
        vf_loss = 0.5 * torch.mean((vpred.squeeze() - returns.detach()) ** 2)
        # Put old neglogprobs from buffer into tensor
        neglogprobs_old = flatten_dims(neglogprobs, 0)
        # Calculate exp difference between old nlp and neglogprobs_new
        # neglogprobs: negative log probability of the action (old)
        # neglogprobs_new: negative log probability of the action (new)
        ratio = torch.exp(neglogprobs_old.detach() - neglogprobs_new.squeeze())
        # Put advantages and negative advantages into tensors
        advantages = flatten_dims(advantages.detach(), 0)
        neg_advantages = -advantages
        # Calculate policy gradient loss. Once multiplied with original ratio
        # between old and new policy probs (1 if identical) and once with
        # clipped ratio.
        policy_gradient_losses1 = neg_advantages * ratio
        policy_gradient_losses2 = neg_advantages * torch.clamp(
            ratio, min=1.0 - self.cliprange, max=1.0 + self.cliprange
        # Get the bigger of the two losses
        policy_gradient_loss_surr = torch.max(
            policy_gradient_losses1, policy_gradient_losses2
        # Get the average policy gradient loss
        policy_gradient_loss = torch.mean(policy_gradient_loss_surr)

        # Get an approximation of the kl-difference between old and new policy
        # probabilities (mean squared difference)
        approx_kl_divergence = 0.5 * torch.mean(
            (neglogprobs_old - neglogprobs_new.squeeze()) ** 2
        # Get the fraction of times that the policy gradient loss was clipped
        clipfrac = torch.mean(
            (torch.abs(policy_gradient_losses2 - policy_gradient_loss_surr) > 1e-6)

        # Multiply the policy entropy with the entropy coeficient
        entropy_loss = (-self.entropy_coef) * entropy

        # Calculate the total loss out of the policy gradient loss, the entropy
        # loss (*entropy_coef), the value function loss (*0.5) and feature loss
        # TODO: problem in pg loss and vf loss: Trying to backpropagate a second time
        ppo_loss = policy_gradient_loss + entropy_loss + vf_loss

        return ppo_loss, {
            "ppo/approx_kl_divergence": approx_kl_divergence,
            "ppo/clipfraction": clipfrac,
            "loss/policy_gradient_loss": policy_gradient_loss,
            "loss/value_loss": vf_loss,
            "loss/entropy_loss": entropy_loss,