def get_predictions(self, ac, features):
    """Get the current prediction of the dynamics model.

    Parameters
    ----------
    ac : Tensor
        Actions taken during the rollout.
    features : Tensor
        Features of the observations the actions were taken in.

    Returns
    -------
    Tensor
        Output of the dynamics network.

    TODO: reimplement chunking
    """
    # TODO: refactor this function, too many shape transformations in ac, confusing
    sh = ac.shape  # = [1, nsteps_per_seg]
    ac = flatten_dims(ac, len(self.ac_space.shape))  # shape = [nsteps_per_seg]
    # Turn actions into a one-hot encoding
    ac = torch.zeros(ac.shape + (self.ac_space.n, )).scatter_(
        1, ac.unsqueeze(1).type(torch.int64),
        1)  # shape = [nsteps_per_seg, ac_space.n]

    sh = features.shape  # [1, nsteps_per_seg, feature_dim]
    x = flatten_dims(features, 1)  # [nsteps_per_seg, feature_dim]
    assert x.shape[:-1] == ac.shape[:-1]

    # Forward pass of actions and features through the dynamics net
    x = self.dynamics_net(x, ac)
    # Reshape back to [1, nsteps_per_seg, feature_dim]
    x = unflatten_first_dim(x, sh)
    return x
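# Illustrative sketch (standalone, not part of the class above): how the scatter_
# call in get_predictions turns a vector of discrete action indices into one-hot
# rows. The action values and the action count are made up for the example.
import torch

actions = torch.tensor([2, 0, 1])                        # [nsteps_per_seg]
n_actions = 4                                            # ac_space.n
one_hot = torch.zeros(actions.shape + (n_actions, )).scatter_(
    1, actions.unsqueeze(1).type(torch.int64), 1)        # [nsteps_per_seg, n_actions]
# one_hot == [[0, 0, 1, 0], [1, 0, 0, 0], [0, 1, 0, 0]]
# torch.nn.functional.one_hot(actions, n_actions) produces the same encoding (as int64).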
def decoder(self, z):
    """Run latent space activations through the decoder model, apply spherical
    scaling if needed and return the distribution of the reconstructions.

    Parameters
    ----------
    z : Tensor
        Latent activations in the VAE after processing in the encoder.

    Returns
    -------
    torch.distributions.Normal
        Reconstruction distribution.
    """
    z_has_timesteps = len(z.shape) == 3
    if z_has_timesteps:
        sh = z.shape
        z = flatten_dims(z, 1)

    # Run the latent vector through the decoder model
    z = self.decoder_model(z)

    # Reshape if needed
    if z_has_timesteps:
        z = unflatten_first_dim(z, sh)

    # Calculate the scale parameter
    if self.spherical_obs:
        scale = torch.max(self.scale, torch.tensor(-4.0))
        scale = torch.nn.functional.softplus(scale)
        scale = scale * torch.ones(z.shape)
    else:
        z, scale = torch.split(z, [4, 4], -3)
        scale = torch.nn.functional.softplus(scale)
    # Return the scaled distribution of the decoder reconstruction
    return torch.distributions.normal.Normal(z, scale)
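# Illustrative sketch (standalone, not part of the class above): how an
# unconstrained, learned scale parameter is mapped to a strictly positive standard
# deviation with softplus before building the Normal reconstruction distribution.
# The values and the mean shape are made up.
import torch

raw_scale = torch.tensor(-4.0)                    # learned scale, can be any real number
scale = torch.nn.functional.softplus(raw_scale)   # log(1 + exp(-4)) ~ 0.018 > 0
mean = torch.zeros(2, 3)                          # stand-in for the decoder output
dist = torch.distributions.normal.Normal(mean, scale * torch.ones(mean.shape))
sample = dist.sample()                            # a reconstruction sample
log_prob = dist.log_prob(sample)                  # element-wise log-likelihood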
def get_features(self, obs):
    """Get features from the feature network.

    Parameters
    ----------
    obs : array
        Observations for which to get features.

    Returns
    -------
    Tensor
        Features of the observations.
    """
    # TODO: refactor - too many shape transformations in obs and act, confusing
    has_timesteps = len(obs.shape) == 5
    if has_timesteps:
        sh = obs.shape  # shape=[1, nsteps, H, W, C]
        obs = flatten_dims(obs, len(self.ob_space.shape))  # shape=[nsteps, H, W, C]

    # Normalize observations
    obs = (obs - self.ob_mean) / self.ob_std
    # Reorder axes to channels-first, shape=[nsteps, C, H, W]
    obs = obs.permute([i for i in range(len(obs.shape) - 3)] + [-1, -3, -2])
    # Get features from the features_model
    act = self.features_model(obs)

    if has_timesteps:
        act = unflatten_first_dim(act, sh)
    return act
def get_features(self, obs):
    """Get features from the feature network.

    Parameters
    ----------
    obs : array
        Observations for which to get features.

    Returns
    -------
    Tensor
        Features of the observations.
    """
    has_timesteps = len(obs.shape) == 5
    if has_timesteps:
        sh = obs.shape  # shape=[1, nsteps, H, W, C]
        obs = flatten_dims(obs, len(self.ob_space.shape))  # shape=[nsteps, H, W, C]

    # Normalize observations
    obs = (obs - self.ob_mean) / self.ob_std
    # Reorder axes to channels-first, shape=[nsteps, C, H, W]
    obs = np.transpose(obs,
                       [i for i in range(len(obs.shape) - 3)] + [-1, -3, -2])
    # Get features from the features_model
    act = self.features_model(torch.tensor(obs).to(self.device))

    if has_timesteps:
        act = unflatten_first_dim(act, sh)
    return act
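# Illustrative sketch (standalone, not part of the classes above): the axis list
# used in both get_features variants moves the channel axis in front of height and
# width while leaving any leading step axes untouched, i.e.
# [nsteps, H, W, C] -> [nsteps, C, H, W]. The shapes are made up.
import numpy as np

obs = np.zeros((8, 84, 84, 4))                                    # [nsteps, H, W, C]
axes = [i for i in range(len(obs.shape) - 3)] + [-1, -3, -2]      # [0, -1, -3, -2]
assert np.transpose(obs, axes).shape == (8, 4, 84, 84)            # [nsteps, C, H, W]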
def get_loss_partial(self):
    """Get the loss of the dynamics model with dropout applied.

    use_disagreement is not used here because the dynamics models are trained on
    the prediction error; the disagreement is only used as a reward signal for the
    policy. Dropout is applied to the loss to enforce some variance between the
    models while still using all of the data.

    Returns
    -------
    Tensor
        Mean squared difference between the predictions and the next features,
        per timestep, with dropout applied.
    """
    ac = self.ac
    sh = ac.shape  # = [1, nsteps_per_seg]
    ac = flatten_dims(ac, len(self.ac_space.shape))  # shape = [nsteps_per_seg]
    # Turn actions into a one-hot encoding
    ac = torch.zeros(ac.shape + (self.ac_space.n, )).scatter_(
        1, torch.tensor(ac).unsqueeze(1),
        1)  # shape = [nsteps_per_seg, ac_space.n]

    features = self.features
    sh = features.shape  # [1, nsteps_per_seg, feature_dim]
    x = flatten_dims(features, 1)  # [nsteps_per_seg, feature_dim]
    assert x.shape[:-1] == ac.shape[:-1]

    # Forward pass of actions and features through the dynamics net
    x = self.dynamics_net(x.to(self.device), ac.to(self.device))
    # Reshape back to [1, nsteps_per_seg, feature_dim]
    x = unflatten_first_dim(x, sh)

    # Take the mean squared difference between the output features (input was the
    # current features and action) and the next features. The mean is taken over
    # the feature dimension, shape=[1, nsteps_per_seg].
    next_features = self.next_features
    loss = torch.mean((x - next_features)**2, -1)

    # Apply dropout here to ensure variability between the dynamics models. This is
    # done instead of bootstrapping the samples so that all samples can be used to
    # train every model.
    do = torch.nn.Dropout(p=0.2)
    do_loss = do(loss)
    return do_loss  # per-timestep MSE with dropout applied
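# Illustrative sketch (standalone, not part of the class above): applying dropout
# to the per-step prediction errors gives every ensemble member a different random
# subset of the same rollout, instead of bootstrapping separate datasets. The loss
# values are made up.
import torch

per_step_loss = torch.tensor([[0.10, 0.40, 0.25, 0.05]])   # [1, nsteps_per_seg]
dropout = torch.nn.Dropout(p=0.2)                           # a fresh module is in train mode
masked_loss = dropout(per_step_loss)   # ~20% of entries zeroed, the rest scaled by 1/0.8
# Each dynamics model draws its own mask, so their gradients (and hence their
# predictions) decorrelate even though they all see the full rollout.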
def get_loss(self):
    """Calculate the auxiliary loss (backward loss).

    This is the cross entropy between the predicted action probabilities and the
    actions actually performed.

    Returns
    -------
    Tensor
        Losses for each action prediction.
    """
    # Concatenate the features of the current and the next observation
    x = torch.cat([self.features, self.next_features], 2)
    x = flatten_dims(x, 1)
    # Get action logits. shape=[nsteps_per_seg, ac_space.n]
    param = self.fc(x)
    # Create a probability distribution from the logits
    idfpd = self.policy.ac_pdtype.pdfromflat(param)
    # Get the actions that were actually performed and flatten.
    # shape=[nsteps_per_seg]
    ac = flatten_dims(self.ac, len(self.ac_space.shape))
    # Calculate the cross entropy between the predicted action distribution and
    # the actual actions. shape=[nsteps_per_seg, 1]
    return idfpd.neglogp(ac)
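# Illustrative sketch (standalone, not part of the class above): for a discrete
# action space, the negative log-probability of the taken action under the
# predicted logits is exactly a cross-entropy loss, which is what the auxiliary
# (action-prediction) head above minimizes. The logits and actions are made up.
import torch

logits = torch.tensor([[2.0, 0.5, -1.0], [0.1, 0.2, 3.0]])   # [nsteps, ac_space.n]
taken = torch.tensor([0, 2])                                  # actions actually performed
neglogp = -torch.distributions.Categorical(logits=logits).log_prob(taken)
same = torch.nn.functional.cross_entropy(logits, taken, reduction="none")
assert torch.allclose(neglogp, same)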
def get_loss(self):
    """Get the current loss of the dynamics model.

    Returns
    -------
    Tensor
        If use_disagreement is True, the raw output of the dynamics network;
        otherwise the mean squared difference between the output and the next
        features.
    """
    ac = self.ac
    sh = ac.shape  # = [1, nsteps_per_seg]
    ac = flatten_dims(ac, len(self.ac_space.shape))  # shape = [nsteps_per_seg]
    # Turn actions into a one-hot encoding
    ac = torch.zeros(ac.shape + (self.ac_space.n, )).scatter_(
        1, torch.tensor(ac).unsqueeze(1),
        1)  # shape = [nsteps_per_seg, ac_space.n]

    features = self.features
    sh = features.shape  # [1, nsteps_per_seg, feature_dim]
    x = flatten_dims(features, 1)  # [nsteps_per_seg, feature_dim]
    assert x.shape[:-1] == ac.shape[:-1]

    # Forward pass of actions and features through the dynamics net
    x = self.dynamics_net(x.to(self.device), ac.to(self.device))
    # Reshape back to [1, nsteps_per_seg, feature_dim]
    x = unflatten_first_dim(x, sh)

    if self.use_disagreement:
        # Return the raw output of the dynamics network
        # (shape=[1, nsteps_per_seg, next_feature_dim])
        return x
    else:
        # Take the mean squared difference between the output features (input was
        # the current features and action) and the next features
        # (shape=[1, nsteps_per_seg])
        next_features = self.next_features
        return torch.mean((x - next_features)**2, -1)
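# Illustrative sketch (standalone, not part of the class above): with
# use_disagreement=True each dynamics model returns its raw feature prediction, and
# the disagreement signal is the variance of those predictions across the ensemble
# (compare the commented-out disagreement computation in update()). Shapes and the
# reduction over the feature dimension are illustrative assumptions.
import torch

n_models, nsteps, feature_dim = 5, 4, 8
predictions = torch.randn(n_models, 1, nsteps, feature_dim)   # stacked get_loss() outputs
disagreement = predictions.var(dim=0).mean(dim=-1)            # [1, nsteps] reward signal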
def update(self):
    """Calculate losses and update parameters based on the current rollout.

    Returns
    -------
    dict
        Dictionary of infos about the current update and training statistics.
    """
    if self.normrew:
        # Normalize the rewards using the running mean and std
        discounted_rewards = np.array([
            self.reward_forward_filter.update(rew)
            for rew in self.rollout.buf_rewards.T
        ])
        rewards_mean, rewards_std, rewards_count = mpi_moments(
            discounted_rewards.ravel())
        # Update the running mean/std of the reward forward filter
        self.reward_stats.update_from_moments(rewards_mean, rewards_std**2,
                                              rewards_count)
        rews = self.rollout.buf_rewards / np.sqrt(self.reward_stats.var)
    else:
        rews = np.copy(self.rollout.buf_rewards)

    # Calculate advantages using the current rewards and value estimates
    self.calculate_advantages(rews=rews,
                              use_done=self.use_done,
                              gamma=self.gamma,
                              lam=self.lam)

    # Initialize and update the info dict for logging
    info = dict()
    info["ppo/advantage_mean"] = self.buf_advantages.mean()
    info["ppo/advantage_std"] = self.buf_advantages.std()
    info["ppo/return_mean"] = self.buf_returns.mean()
    info["ppo/return_std"] = self.buf_returns.std()
    info["ppo/value_est_mean"] = self.rollout.buf_vpreds.mean()
    info["ppo/value_est_std"] = self.rollout.buf_vpreds.std()
    info["ppo/explained_variance"] = explained_variance(
        self.rollout.buf_vpreds.ravel(), self.buf_returns.ravel())
    info["ppo/reward_mean"] = np.mean(self.rollout.buf_rewards)

    if self.rollout.best_ext_return is not None:
        info["performance/best_ext_return"] = self.rollout.best_ext_return

    # TODO: maybe add extra flag for detailed logging so runs are not slowed down
    if not self.debugging:
        feature_stats, stacked_act_feat = self.get_activation_stats(
            self.rollout.buf_acts_features, "activations_features/")
        hidden_stats, stacked_act_pi = self.get_activation_stats(
            self.rollout.buf_acts_pi, "activations_hidden/")
        info.update(feature_stats)
        info.update(hidden_stats)

        info["activations_features/raw_act_distribution"] = wandb.Histogram(
            stacked_act_feat)
        info["activations_hidden/raw_act_distribution"] = wandb.Histogram(
            stacked_act_pi)
        info["ppo/action_distribution"] = wandb.Histogram(
            self.rollout.buf_acs.flatten())

        if self.vLogFreq >= 0 and self.n_updates % self.vLogFreq == 0:
            print(str(self.n_updates) + " updates - logging video.")
            # Reshape images such that they have shape [time, channels, width, height]
            sample_video = np.moveaxis(self.rollout.buf_obs[0], 3, 1)
            # Log buffer video from the first env
            info["observations"] = wandb.Video(sample_video, fps=12, format="gif")

    to_report = Counter()

    if self.normadv:  # defaults to True
        # Normalize advantages
        m, s = get_mean_and_std(self.buf_advantages)
        self.buf_advantages = (self.buf_advantages - m) / (s + 1e-7)

    # Set update hyperparameters
    envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
    envsperbatch = max(1, envsperbatch)
    envinds = np.arange(self.nenvs * self.nsegs_per_env)

    # Update the networks & get losses for nepochs * nminibatches
    for _ in range(self.nepochs):
        np.random.shuffle(envinds)
        for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):
            end = start + envsperbatch
            minibatch_envinds = envinds[start:end]  # minibatch environment indices

            # Get rollout experiences for the current minibatch
            acs = self.rollout.buf_acs[minibatch_envinds]
            rews = self.rollout.buf_rewards[minibatch_envinds]
            # Negative log probabilities (action probabilities from pi)
            neglogprobs = self.rollout.buf_neglogprobs[minibatch_envinds]
            obs = self.rollout.buf_obs[minibatch_envinds]
            returns = self.buf_returns[minibatch_envinds]
            advantages = self.buf_advantages[minibatch_envinds]
            last_obs = self.rollout.buf_obs_last[minibatch_envinds]

            # Update features of the policy network to minibatch obs and acs
            self.policy.update_features(obs, acs)
            # Update features of the auxiliary network to minibatch obs and acs.
            # Using the first element in the dynamics list is sufficient because
            # all dynamics models share the same auxiliary task model and features.
            # TODO: should the feature model be independent of dynamics?
            self.dynamics_list[0].auxiliary_task.update_features(obs, last_obs)
            # Get the loss and variance of the feature model
            aux_loss = torch.mean(
                self.dynamics_list[0].auxiliary_task.get_loss())
            # Take the variance over steps -> [feature_dim] variances -> average.
            # This is the average variance of a feature over time.
            feature_var = torch.mean(
                torch.var(self.dynamics_list[0].auxiliary_task.features, [0, 1]))
            feature_var_2 = torch.mean(
                torch.var(self.dynamics_list[0].auxiliary_task.features, [2]))

            # disagreement = []
            dyn_prediction_loss = []
            # Loop through the dynamics models
            for dynamic in self.dynamics_list:
                # Get the features of the observations in the dynamics model
                # (just gets features from the auxiliary model)
                dynamic.update_features()
                # Put features into the dynamics model and get the loss.
                # (If use_disagreement is set, get_loss just returns the raw
                # predictions, therefore the partial loss is used here for
                # optimizing and logging.)
                # disagreement.append(torch.mean(np.var(dynamic.get_loss(), axis=0)))
                # Put features into the dynamics model and get the partial (dropout) loss
                dyn_prediction_loss.append(torch.mean(dynamic.get_loss_partial()))

            # Reshape actions and put them in a tensor
            acs = torch.tensor(flatten_dims(acs, len(self.ac_space.shape))).to(
                self.device)
            # Get the negative log probs of the actions under the policy
            neglogprobs_new = self.policy.pd.neglogp(acs)
            # Get the entropy of the current policy
            entropy = torch.mean(self.policy.pd.entropy())
            # Get the value estimate of the policy's value head
            vpred = self.policy.vpred
            # Calculate the mean squared difference between value estimate and return
            vf_loss = 0.5 * torch.mean(
                (vpred.squeeze() - torch.tensor(returns).to(self.device))**2)
            # Put the old neglogprobs from the buffer into a tensor
            neglogprobs_old = torch.tensor(flatten_dims(neglogprobs, 0)).to(
                self.device)
            # Calculate the exponentiated difference between the old and new
            # negative log probabilities of the actions
            ratio = torch.exp(neglogprobs_old - neglogprobs_new.squeeze())
            # Put advantages and negative advantages into tensors
            advantages = flatten_dims(advantages, 0)
            neg_advantages = torch.tensor(-advantages).to(self.device)

            # Calculate the policy gradient loss: once multiplied with the original
            # ratio between old and new policy probs (1 if identical) and once with
            # the clipped ratio.
            policy_gradient_losses1 = neg_advantages * ratio
            policy_gradient_losses2 = neg_advantages * torch.clamp(
                ratio, min=1.0 - self.cliprange, max=1.0 + self.cliprange)
            # Take the bigger of the two losses
            policy_gradient_loss_surr = torch.max(policy_gradient_losses1,
                                                  policy_gradient_losses2)
            # Get the average policy gradient loss
            policy_gradient_loss = torch.mean(policy_gradient_loss_surr)

            # Get an approximation of the KL divergence between old and new policy
            # probabilities (mean squared difference)
            approx_kl_divergence = 0.5 * torch.mean(
                (neglogprobs_old - neglogprobs_new.squeeze())**2)
            # Get the fraction of times that the policy gradient loss was clipped
            clipfrac = torch.mean(
                (torch.abs(policy_gradient_losses2 - policy_gradient_loss_surr) >
                 1e-6).float())

            # Multiply the policy entropy with the entropy coefficient
            entropy_loss = (-self.entropy_coef) * entropy

            # Calculate the total loss out of the policy gradient loss, the entropy
            # loss (*entropy_coef), the value function loss (*0.5) and the feature loss
            total_loss = policy_gradient_loss + entropy_loss + vf_loss + aux_loss
            for i in range(len(dyn_prediction_loss)):
                # Add the loss of each of the dynamics networks to the total loss
                total_loss = total_loss + dyn_prediction_loss[i]

            # Propagate the loss back through the networks
            total_loss.backward()
            self.optimizer.step()
            # Set the gradients back to zero
            self.optimizer.zero_grad()

            # Log statistics (divide by nminibatches * nepochs because the losses
            # are accumulated over these two loops)
            n_batches = self.nminibatches * self.nepochs
            to_report["loss/total_loss"] += total_loss.cpu().data.numpy() / n_batches
            to_report["loss/policy_gradient_loss"] += policy_gradient_loss.cpu(
            ).data.numpy() / n_batches
            to_report["loss/value_loss"] += vf_loss.cpu().data.numpy() / n_batches
            to_report["loss/entropy_loss"] += entropy_loss.cpu().data.numpy(
            ) / n_batches
            to_report["ppo/approx_kl_divergence"] += approx_kl_divergence.cpu(
            ).data.numpy() / n_batches
            to_report["ppo/clipfraction"] += clipfrac.cpu().data.numpy() / n_batches
            to_report["phi/feature_var_ax01"] += feature_var.cpu().data.numpy(
            ) / n_batches
            to_report["phi/feature_var_ax2"] += feature_var_2.cpu().data.numpy(
            ) / n_batches
            to_report["loss/auxiliary_task"] += aux_loss.cpu().data.numpy(
            ) / n_batches
            to_report["loss/dynamic_loss"] += np.sum(
                [e.cpu().data.numpy() for e in dyn_prediction_loss]) / n_batches

    info.update(to_report)
    self.n_updates += 1
    info["performance/buffer_external_rewards"] = np.sum(
        self.rollout.buf_ext_rewards)
    # This is especially for the robot_arm environment because the touch sensor
    # magnitude can vary a lot.
    info["performance/buffer_external_rewards_mean"] = np.mean(
        self.rollout.buf_ext_rewards)
    info["performance/buffer_external_rewards_present"] = np.mean(
        self.rollout.buf_ext_rewards > 0)
    info["run/n_updates"] = self.n_updates
    info.update({
        dn: (np.mean(dvs) if len(dvs) > 0 else 0)
        for (dn, dvs) in self.rollout.statlists.items()
    })
    info.update(self.rollout.stats)
    if "states_visited" in info:
        info.pop("states_visited")

    tnow = time.time()
    info["run/updates_per_second"] = 1.0 / (tnow - self.t_last_update)
    self.total_secs = tnow - self.t_start + self.time_trained_so_far
    info["run/total_secs"] = self.total_secs
    info["run/tps"] = self.rollout.nsteps * self.nenvs / (tnow - self.t_last_update)
    self.t_last_update = tnow

    return info
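# Illustrative sketch (standalone, not part of the class above): calculate_advantages
# is called in update() with gamma and lam but its body is not shown here. Assuming it
# implements standard GAE(lambda), the recursion looks roughly like this; episode
# termination masking (use_done) is omitted and all names are illustrative.
import numpy as np

def gae_advantages(rewards, values, last_value, gamma=0.99, lam=0.95):
    """rewards, values: [nsteps, nenvs]; last_value: [nenvs]."""
    nsteps = rewards.shape[0]
    advantages = np.zeros_like(rewards)
    lastgaelam = 0.0
    for t in reversed(range(nsteps)):
        next_value = last_value if t == nsteps - 1 else values[t + 1]
        delta = rewards[t] + gamma * next_value - values[t]   # one-step TD error
        lastgaelam = delta + gamma * lam * lastgaelam          # exponentially weighted sum
        advantages[t] = lastgaelam
    returns = advantages + values                              # targets for the value head
    return advantages, returns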
def ppo_loss(self, obs, acs, neglogprobs, advantages, returns, *args):
    """Calculate the PPO loss and statistics for a minibatch of rollout data."""
    # Reshape actions and put them in a tensor
    acs = flatten_dims(acs, len(self.ac_space.shape))
    # Update the logits of the newest policy corresponding to the current obs & acs
    self.policy.update_features(obs, acs)
    # Get the negative log probs of the actions under the policy
    neglogprobs_new = self.policy.pd.neglogp(acs.type(torch.LongTensor))
    # Get the entropy of the current policy
    entropy = torch.mean(self.policy.pd.entropy())
    # Get the value estimate of the policy's value head
    vpred = self.policy.vpred
    # Calculate the mean squared difference between value estimate and return
    vf_loss = 0.5 * torch.mean((vpred.squeeze() - returns.detach())**2)
    # Put the old neglogprobs from the buffer into a tensor
    neglogprobs_old = flatten_dims(neglogprobs, 0)
    # Calculate the exponentiated difference between the old and new negative
    # log probabilities of the actions
    ratio = torch.exp(neglogprobs_old.detach() - neglogprobs_new.squeeze())
    # Put advantages and negative advantages into tensors
    advantages = flatten_dims(advantages.detach(), 0)
    neg_advantages = -advantages

    # Calculate the policy gradient loss: once multiplied with the original ratio
    # between old and new policy probs (1 if identical) and once with the clipped ratio.
    policy_gradient_losses1 = neg_advantages * ratio
    policy_gradient_losses2 = neg_advantages * torch.clamp(
        ratio, min=1.0 - self.cliprange, max=1.0 + self.cliprange)
    # Take the bigger of the two losses
    policy_gradient_loss_surr = torch.max(policy_gradient_losses1,
                                          policy_gradient_losses2)
    # Get the average policy gradient loss
    policy_gradient_loss = torch.mean(policy_gradient_loss_surr)

    # Get an approximation of the KL divergence between old and new policy
    # probabilities (mean squared difference)
    approx_kl_divergence = 0.5 * torch.mean(
        (neglogprobs_old - neglogprobs_new.squeeze())**2)
    # Get the fraction of times that the policy gradient loss was clipped
    clipfrac = torch.mean(
        (torch.abs(policy_gradient_losses2 - policy_gradient_loss_surr) >
         1e-6).float())

    # Multiply the policy entropy with the entropy coefficient
    entropy_loss = (-self.entropy_coef) * entropy

    # Calculate the total PPO loss out of the policy gradient loss, the entropy
    # loss (*entropy_coef) and the value function loss (*0.5)
    # TODO: problem in pg loss and vf loss: trying to backpropagate a second time
    ppo_loss = policy_gradient_loss + entropy_loss + vf_loss

    return ppo_loss, {
        "ppo/approx_kl_divergence": approx_kl_divergence,
        "ppo/clipfraction": clipfrac,
        "loss/policy_gradient_loss": policy_gradient_loss,
        "loss/value_loss": vf_loss,
        "loss/entropy_loss": entropy_loss,
    }
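# Illustrative sketch (standalone, not part of the class above): the clipped
# surrogate objective from ppo_loss on toy numbers. With ratio = exp(neglogp_old -
# neglogp_new), taking the element-wise max of the unclipped and clipped terms (on
# negated advantages) caps how far a single update can move the policy. All values
# are made up.
import torch

cliprange = 0.1
ratio = torch.tensor([0.8, 1.0, 1.3])             # new/old action probability ratios
neg_advantages = torch.tensor([-1.0, 0.5, -2.0])  # negated advantage estimates
surr1 = neg_advantages * ratio
surr2 = neg_advantages * torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange)
pg_loss = torch.max(surr1, surr2).mean()          # the last ratio (1.3) is clipped to 1.1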