def log(self, run_stats, train_stats, start_time):
    """ Visualise behaviour, evaluate the policy, save models, and write to the tensorboard logger. """

    train_stats, meta_train_stats = train_stats

    # --- visualise behaviour of policy ---

    if self.iter_idx % self.args.vis_interval == 0:
        obs_rms = self.envs.venv.obs_rms if self.args.norm_obs_for_policy else None
        ret_rms = self.envs.venv.ret_rms if self.args.norm_rew_for_policy else None

        utl_eval.visualise_behaviour(args=self.args,
                                     policy=self.policy,
                                     image_folder=self.logger.full_output_folder,
                                     iter_idx=self.iter_idx,
                                     obs_rms=obs_rms,
                                     ret_rms=ret_rms,
                                     encoder=self.vae.encoder,
                                     reward_decoder=self.vae.reward_decoder,
                                     state_decoder=self.vae.state_decoder,
                                     task_decoder=self.vae.task_decoder,
                                     compute_rew_reconstruction_loss=self.vae.compute_rew_reconstruction_loss,
                                     compute_state_reconstruction_loss=self.vae.compute_state_reconstruction_loss,
                                     compute_task_reconstruction_loss=self.vae.compute_task_reconstruction_loss,
                                     compute_kl_loss=self.vae.compute_kl_loss,
                                     )

    # --- evaluate policy ----

    if self.iter_idx % self.args.eval_interval == 0:
        obs_rms = self.envs.venv.obs_rms if self.args.norm_obs_for_policy else None
        ret_rms = self.envs.venv.ret_rms if self.args.norm_rew_for_policy else None

        returns_per_episode = utl_eval.evaluate(args=self.args,
                                                policy=self.policy,
                                                obs_rms=obs_rms,
                                                ret_rms=ret_rms,
                                                encoder=self.vae.encoder,
                                                iter_idx=self.iter_idx)

        # log the return avg/std across tasks (=processes)
        returns_avg = returns_per_episode.mean(dim=0)
        returns_std = returns_per_episode.std(dim=0)
        for k in range(len(returns_avg)):
            self.logger.add('return_avg_per_iter/episode_{}'.format(k + 1), returns_avg[k], self.iter_idx)
            self.logger.add('return_avg_per_frame/episode_{}'.format(k + 1), returns_avg[k], self.frames)
            self.logger.add('return_std_per_iter/episode_{}'.format(k + 1), returns_std[k], self.iter_idx)
            self.logger.add('return_std_per_frame/episode_{}'.format(k + 1), returns_std[k], self.frames)

        print("Updates {}, num timesteps {}, FPS {}, {} \n Mean return (train): {:.5f} \n".format(
            self.iter_idx, self.frames, int(self.frames / (time.time() - start_time)),
            self.vae.rollout_storage.prev_obs.shape, returns_avg[-1].item()))

    # --- save models ---

    if self.iter_idx % self.args.save_interval == 0:
        save_path = os.path.join(self.logger.full_output_folder, 'models')
        if not os.path.exists(save_path):
            os.mkdir(save_path)

        torch.save(self.policy.actor_critic, os.path.join(save_path, "policy{0}.pt".format(self.iter_idx)))
        torch.save(self.vae.encoder, os.path.join(save_path, "encoder{0}.pt".format(self.iter_idx)))
        if self.vae.state_decoder is not None:
            torch.save(self.vae.state_decoder, os.path.join(save_path, "state_decoder{0}.pt".format(self.iter_idx)))
        if self.vae.reward_decoder is not None:
            torch.save(self.vae.reward_decoder, os.path.join(save_path, "reward_decoder{0}.pt".format(self.iter_idx)))
        if self.vae.task_decoder is not None:
            torch.save(self.vae.task_decoder, os.path.join(save_path, "task_decoder{0}.pt".format(self.iter_idx)))

        # save normalisation params of envs
        if self.args.norm_rew_for_policy:
            # save rolling mean and std
            rew_rms = self.envs.venv.ret_rms
            utl.save_obj(rew_rms, save_path, "env_rew_rms{0}.pkl".format(self.iter_idx))
        if self.args.norm_obs_for_policy:
            obs_rms = self.envs.venv.obs_rms
            utl.save_obj(obs_rms, save_path, "env_obs_rms{0}.pkl".format(self.iter_idx))

    # --- log some other things ---

    if self.iter_idx % self.args.log_interval == 0:

        self.logger.add('policy_losses/value_loss', train_stats[0], self.iter_idx)
        self.logger.add('policy_losses/action_loss', train_stats[1], self.iter_idx)
        self.logger.add('policy_losses/dist_entropy', train_stats[2], self.iter_idx)
        self.logger.add('policy_losses/sum', train_stats[3], self.iter_idx)

        self.logger.add('policy/action', run_stats[0][0].float().mean(), self.iter_idx)
        if hasattr(self.policy.actor_critic, 'logstd'):
            self.logger.add('policy/action_logstd', self.policy.actor_critic.dist.logstd.mean(), self.iter_idx)
        self.logger.add('policy/action_logprob', run_stats[1].mean(), self.iter_idx)
        self.logger.add('policy/value', run_stats[2].mean(), self.iter_idx)

        self.logger.add('encoder/latent_mean', torch.cat(self.policy_storage.latent_mean).mean(), self.iter_idx)
        self.logger.add('encoder/latent_logvar', torch.cat(self.policy_storage.latent_logvar).mean(), self.iter_idx)

        # log the average weights and gradients of all models (where applicable)
        for [model, name] in [
            [self.policy.actor_critic, 'policy'],
            [self.vae.encoder, 'encoder'],
            [self.vae.reward_decoder, 'reward_decoder'],
            [self.vae.state_decoder, 'state_transition_decoder'],
            [self.vae.task_decoder, 'task_decoder'],
        ]:
            if model is not None:
                param_list = list(model.parameters())
                param_mean = np.mean([param_list[i].data.cpu().numpy().mean() for i in range(len(param_list))])
                self.logger.add('weights/{}'.format(name), param_mean, self.iter_idx)
                if name == 'policy':
                    self.logger.add('weights/policy_std', param_list[0].data.mean(), self.iter_idx)
                if param_list[0].grad is not None:
                    param_grad_mean = np.mean([param_list[i].grad.cpu().numpy().mean() for i in range(len(param_list))])
                    self.logger.add('gradients/{}'.format(name), param_grad_mean, self.iter_idx)
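# log() above only relies on the logger exposing `full_output_folder` and an
# `add(name, value, step)` method. TBLogger itself is defined elsewhere in the repo;
# the class below is a minimal sketch of such an interface on top of
# torch.utils.tensorboard.SummaryWriter. It is an assumption for illustration, not the
# repo's TBLogger, and ScalarLoggerSketch is a hypothetical name.

from torch.utils.tensorboard import SummaryWriter


class ScalarLoggerSketch:
    """Thin wrapper mirroring the add(name, value, step) calls used in log()."""

    def __init__(self, output_folder):
        self.full_output_folder = output_folder
        self.writer = SummaryWriter(log_dir=output_folder)

    def add(self, name, value, step):
        # add_scalar accepts Python floats as well as (CPU or CUDA) 0-dim tensors
        self.writer.add_scalar(name, value, step)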
def __init__(self, args):

    self.args = args
    utl.seed(self.args.seed, self.args.deterministic_execution)

    # calculate number of updates and keep count of frames/iterations
    self.num_updates = int(args.num_frames) // args.policy_num_steps // args.num_processes
    self.frames = 0
    self.iter_idx = -1

    # initialise tensorboard logger
    self.logger = TBLogger(self.args, self.args.exp_label)

    # initialise environments
    self.envs = make_vec_envs(env_name=args.env_name,
                              seed=args.seed,
                              num_processes=args.num_processes,
                              gamma=args.policy_gamma,
                              device=device,
                              episodes_per_task=self.args.max_rollouts_per_task,
                              normalise_rew=args.norm_rew_for_policy,
                              ret_rms=None,
                              tasks=None)

    if self.args.single_task_mode:
        # get the current tasks (which will be num_process many different tasks)
        self.train_tasks = self.envs.get_task()
        # set the tasks to the first task (i.e. just a random task)
        self.train_tasks[1:] = self.train_tasks[0]
        # make it a list
        self.train_tasks = [t for t in self.train_tasks]
        # re-initialise environments with those tasks
        self.envs = make_vec_envs(env_name=args.env_name,
                                  seed=args.seed,
                                  num_processes=args.num_processes,
                                  gamma=args.policy_gamma,
                                  device=device,
                                  episodes_per_task=self.args.max_rollouts_per_task,
                                  normalise_rew=args.norm_rew_for_policy,
                                  ret_rms=None,
                                  tasks=self.train_tasks,
                                  )
        # save the training tasks so we can evaluate on the same envs later
        utl.save_obj(self.train_tasks, self.logger.full_output_folder, "train_tasks")
    else:
        self.train_tasks = None

    # calculate what the maximum length of the trajectories is
    args.max_trajectory_len = self.envs._max_episode_steps
    args.max_trajectory_len *= self.args.max_rollouts_per_task

    # get policy input dimensions
    self.args.state_dim = self.envs.observation_space.shape[0]
    self.args.task_dim = self.envs.task_dim
    self.args.belief_dim = self.envs.belief_dim
    self.args.num_states = self.envs.num_states
    # get policy output (action) dimensions
    self.args.action_space = self.envs.action_space
    if isinstance(self.envs.action_space, gym.spaces.discrete.Discrete):
        self.args.action_dim = 1
    else:
        self.args.action_dim = self.envs.action_space.shape[0]

    # initialise policy
    self.policy_storage = self.initialise_policy_storage()
    self.policy = self.initialise_policy()
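# The counters above imply that one iteration corresponds to policy_num_steps environment
# steps in each of num_processes parallel environments, so frames should advance by
# policy_num_steps * num_processes per update. The function below is a minimal sketch of
# that bookkeeping under this assumption; it is illustrative only, not the repo's training
# loop, and _count_frames_sketch is a hypothetical name.


def _count_frames_sketch(num_frames, policy_num_steps, num_processes):
    """Show how num_updates, iter_idx and frames relate to each other."""
    num_updates = int(num_frames) // policy_num_steps // num_processes
    frames = 0
    for iter_idx in range(num_updates):
        # ... collect rollouts and update the policy here ...
        frames += policy_num_steps * num_processes
    # after the loop, frames <= num_frames (up to integer-division rounding)
    return num_updates, frames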
def log(self, run_stats, train_stats, start):
    """ Evaluate policy, save model, write to tensorboard logger. """

    # --- visualise behaviour of policy ---

    if self.iter_idx % self.args.vis_interval == 0:
        ret_rms = self.envs.venv.ret_rms if self.args.norm_rew_for_policy else None
        utl_eval.visualise_behaviour(args=self.args,
                                     policy=self.policy,
                                     image_folder=self.logger.full_output_folder,
                                     iter_idx=self.iter_idx,
                                     ret_rms=ret_rms,
                                     )

    # --- evaluate policy ----

    if self.iter_idx % self.args.eval_interval == 0:
        ret_rms = self.envs.venv.ret_rms if self.args.norm_rew_for_policy else None
        returns_per_episode = utl_eval.evaluate(args=self.args,
                                                policy=self.policy,
                                                ret_rms=ret_rms,
                                                iter_idx=self.iter_idx)

        # log the average return across tasks (=processes)
        returns_avg = returns_per_episode.mean(dim=0)
        returns_std = returns_per_episode.std(dim=0)
        for k in range(len(returns_avg)):
            self.logger.add('return_avg_per_iter/episode_{}'.format(k + 1), returns_avg[k], self.iter_idx)
            self.logger.add('return_avg_per_frame/episode_{}'.format(k + 1), returns_avg[k], self.frames)
            self.logger.add('return_std_per_iter/episode_{}'.format(k + 1), returns_std[k], self.iter_idx)
            self.logger.add('return_std_per_frame/episode_{}'.format(k + 1), returns_std[k], self.frames)

        print("Updates {}, num timesteps {}, FPS {} \n Mean return (train): {:.5f} \n".format(
            self.iter_idx, self.frames, int(self.frames / (time.time() - start)), returns_avg[-1].item()))

    # save model
    if self.iter_idx % self.args.save_interval == 0:
        save_path = os.path.join(self.logger.full_output_folder, 'models')
        if not os.path.exists(save_path):
            os.mkdir(save_path)

        idx_labels = ['']
        if self.args.save_intermediate_models:
            idx_labels.append(int(self.iter_idx))

        for idx_label in idx_labels:

            torch.save(self.policy.actor_critic, os.path.join(save_path, f"policy{idx_label}.pt"))

            # save normalisation params of envs
            if self.args.norm_rew_for_policy:
                rew_rms = self.envs.venv.ret_rms
                utl.save_obj(rew_rms, save_path, f"env_rew_rms{idx_label}")
            # TODO: grab from policy and save?
            # if self.args.norm_obs_for_policy:
            #     obs_rms = self.envs.venv.obs_rms
            #     utl.save_obj(obs_rms, save_path, f"env_obs_rms{idx_label}")

    # --- log some other things ---

    if (self.iter_idx % self.args.log_interval == 0) and (train_stats is not None):

        train_stats, _ = train_stats

        self.logger.add('policy_losses/value_loss', train_stats[0], self.iter_idx)
        self.logger.add('policy_losses/action_loss', train_stats[1], self.iter_idx)
        self.logger.add('policy_losses/dist_entropy', train_stats[2], self.iter_idx)
        self.logger.add('policy_losses/sum', train_stats[3], self.iter_idx)

        # writer.add_scalar('policy/action', action.mean(), j)
        self.logger.add('policy/action', run_stats[0][0].float().mean(), self.iter_idx)
        if hasattr(self.policy.actor_critic, 'logstd'):
            self.logger.add('policy/action_logstd', self.policy.actor_critic.dist.logstd.mean(), self.iter_idx)
        self.logger.add('policy/action_logprob', run_stats[1].mean(), self.iter_idx)
        self.logger.add('policy/value', run_stats[2].mean(), self.iter_idx)

        param_list = list(self.policy.actor_critic.parameters())
        param_mean = np.mean([param_list[i].data.cpu().numpy().mean() for i in range(len(param_list))])
        param_grad_mean = np.mean([param_list[i].grad.cpu().numpy().mean() for i in range(len(param_list))])
        self.logger.add('weights/policy', param_mean, self.iter_idx)
        self.logger.add('weights/policy_std', param_list[0].data.cpu().mean(), self.iter_idx)
        self.logger.add('gradients/policy', param_grad_mean, self.iter_idx)
        self.logger.add('gradients/policy_std', param_list[0].grad.cpu().numpy().mean(), self.iter_idx)
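# The normalisation statistics above are written with utl.save_obj, whose implementation
# is outside this section. Below is a minimal pickle-based sketch of such a helper,
# consistent with the .pkl file names used elsewhere; it is an assumption, not the repo's
# actual helper, and save_obj_sketch / load_obj_sketch are hypothetical names. One call
# site passes a name already ending in ".pkl" and the others do not, so the sketch
# appends the extension only when it is missing.

import os
import pickle


def save_obj_sketch(obj, folder, name):
    """Pickle `obj` to <folder>/<name>.pkl."""
    if not name.endswith('.pkl'):
        name = name + '.pkl'
    with open(os.path.join(folder, name), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj_sketch(folder, name):
    """Inverse of save_obj_sketch, e.g. to restore ret_rms before evaluation."""
    if not name.endswith('.pkl'):
        name = name + '.pkl'
    with open(os.path.join(folder, name), 'rb') as f:
        return pickle.load(f)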
def log(self, run_stats, train_stats, start_time):
    """ Visualise behaviour, evaluate the policy, save models, and write to the tensorboard logger. """

    # --- visualise behaviour of policy ---

    if (self.iter_idx + 1) % self.args.vis_interval == 0:
        ret_rms = self.envs.venv.ret_rms if self.args.norm_rew_for_policy else None
        utl_eval.visualise_behaviour(args=self.args,
                                     policy=self.policy,
                                     image_folder=self.logger.full_output_folder,
                                     iter_idx=self.iter_idx,
                                     ret_rms=ret_rms,
                                     encoder=self.vae.encoder,
                                     reward_decoder=self.vae.reward_decoder,
                                     state_decoder=self.vae.state_decoder,
                                     task_decoder=self.vae.task_decoder,
                                     compute_rew_reconstruction_loss=self.vae.compute_rew_reconstruction_loss,
                                     compute_state_reconstruction_loss=self.vae.compute_state_reconstruction_loss,
                                     compute_task_reconstruction_loss=self.vae.compute_task_reconstruction_loss,
                                     compute_kl_loss=self.vae.compute_kl_loss,
                                     tasks=self.train_tasks,
                                     )

    # --- evaluate policy ----

    if (self.iter_idx + 1) % self.args.eval_interval == 0:

        ret_rms = self.envs.venv.ret_rms if self.args.norm_rew_for_policy else None

        returns_per_episode = utl_eval.evaluate(args=self.args,
                                                policy=self.policy,
                                                ret_rms=ret_rms,
                                                encoder=self.vae.encoder,
                                                iter_idx=self.iter_idx,
                                                tasks=self.train_tasks,
                                                )

        # log the return avg/std across tasks (=processes)
        returns_avg = returns_per_episode.mean(dim=0)
        returns_std = returns_per_episode.std(dim=0)
        for k in range(len(returns_avg)):
            self.logger.add('return_avg_per_iter/episode_{}'.format(k + 1), returns_avg[k], self.iter_idx)
            self.logger.add('return_avg_per_frame/episode_{}'.format(k + 1), returns_avg[k], self.frames)
            self.logger.add('return_std_per_iter/episode_{}'.format(k + 1), returns_std[k], self.iter_idx)
            self.logger.add('return_std_per_frame/episode_{}'.format(k + 1), returns_std[k], self.frames)

        print(f"Updates {self.iter_idx}, "
              f"Frames {self.frames}, "
              f"FPS {int(self.frames / (time.time() - start_time))}, "
              f"\n Mean return (train): {returns_avg[-1].item()} \n")

    # --- save models ---

    if (self.iter_idx + 1) % self.args.save_interval == 0:
        save_path = os.path.join(self.logger.full_output_folder, 'models')
        if not os.path.exists(save_path):
            os.mkdir(save_path)

        idx_labels = ['']
        if self.args.save_intermediate_models:
            idx_labels.append(int(self.iter_idx))

        for idx_label in idx_labels:

            torch.save(self.policy.actor_critic, os.path.join(save_path, f"policy{idx_label}.pt"))
            torch.save(self.vae.encoder, os.path.join(save_path, f"encoder{idx_label}.pt"))
            if self.vae.state_decoder is not None:
                torch.save(self.vae.state_decoder, os.path.join(save_path, f"state_decoder{idx_label}.pt"))
            if self.vae.reward_decoder is not None:
                torch.save(self.vae.reward_decoder, os.path.join(save_path, f"reward_decoder{idx_label}.pt"))
            if self.vae.task_decoder is not None:
                torch.save(self.vae.task_decoder, os.path.join(save_path, f"task_decoder{idx_label}.pt"))

            # save normalisation params of envs
            if self.args.norm_rew_for_policy:
                rew_rms = self.envs.venv.ret_rms
                utl.save_obj(rew_rms, save_path, f"env_rew_rms{idx_label}")
            # TODO: grab from policy and save?
            # if self.args.norm_obs_for_policy:
            #     obs_rms = self.envs.venv.obs_rms
            #     utl.save_obj(obs_rms, save_path, f"env_obs_rms{idx_label}")

    # --- log some other things ---

    if ((self.iter_idx + 1) % self.args.log_interval == 0) and (train_stats is not None):

        self.logger.add('environment/state_max', self.policy_storage.prev_state.max(), self.iter_idx)
        self.logger.add('environment/state_min', self.policy_storage.prev_state.min(), self.iter_idx)

        self.logger.add('environment/rew_max', self.policy_storage.rewards_raw.max(), self.iter_idx)
        self.logger.add('environment/rew_min', self.policy_storage.rewards_raw.min(), self.iter_idx)

        self.logger.add('policy_losses/value_loss', train_stats[0], self.iter_idx)
        self.logger.add('policy_losses/action_loss', train_stats[1], self.iter_idx)
        self.logger.add('policy_losses/dist_entropy', train_stats[2], self.iter_idx)
        self.logger.add('policy_losses/sum', train_stats[3], self.iter_idx)

        self.logger.add('policy/action', run_stats[0][0].float().mean(), self.iter_idx)
        if hasattr(self.policy.actor_critic, 'logstd'):
            self.logger.add('policy/action_logstd', self.policy.actor_critic.dist.logstd.mean(), self.iter_idx)
        self.logger.add('policy/action_logprob', run_stats[1].mean(), self.iter_idx)
        self.logger.add('policy/value', run_stats[2].mean(), self.iter_idx)

        self.logger.add('encoder/latent_mean', torch.cat(self.policy_storage.latent_mean).mean(), self.iter_idx)
        self.logger.add('encoder/latent_logvar', torch.cat(self.policy_storage.latent_logvar).mean(), self.iter_idx)

        # log the average weights and gradients of all models (where applicable)
        for [model, name] in [
            [self.policy.actor_critic, 'policy'],
            [self.vae.encoder, 'encoder'],
            [self.vae.reward_decoder, 'reward_decoder'],
            [self.vae.state_decoder, 'state_transition_decoder'],
            [self.vae.task_decoder, 'task_decoder'],
        ]:
            if model is not None:
                param_list = list(model.parameters())
                param_mean = np.mean([param_list[i].data.cpu().numpy().mean() for i in range(len(param_list))])
                self.logger.add('weights/{}'.format(name), param_mean, self.iter_idx)
                if name == 'policy':
                    self.logger.add('weights/policy_std', param_list[0].data.mean(), self.iter_idx)
                if param_list[0].grad is not None:
                    param_grad_mean = np.mean([param_list[i].grad.cpu().numpy().mean() for i in range(len(param_list))])
                    self.logger.add('gradients/{}'.format(name), param_grad_mean, self.iter_idx)
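# The save block above pickles whole nn.Module objects (not just state_dicts) and, for the
# return normalisation, a pickled ret_rms object. The function below is a usage sketch of
# how a separate evaluation script could restore these artefacts; it is an assumption about
# downstream usage, not code from the repo, and load_checkpoint_sketch / model_dir are
# hypothetical names. File names mirror those used in the save loop above.

import os
import pickle

import torch


def load_checkpoint_sketch(model_dir, idx_label=''):
    """Restore policy, encoder and (optionally) reward-normalisation stats (sketch)."""
    # Unpickling full modules requires the original class definitions to be importable,
    # and recent PyTorch versions need weights_only=False (drop the argument on versions
    # that predate it).
    policy_net = torch.load(os.path.join(model_dir, f"policy{idx_label}.pt"),
                            map_location='cpu', weights_only=False)
    encoder = torch.load(os.path.join(model_dir, f"encoder{idx_label}.pt"),
                         map_location='cpu', weights_only=False)
    ret_rms = None
    rms_path = os.path.join(model_dir, f"env_rew_rms{idx_label}.pkl")
    if os.path.exists(rms_path):
        with open(rms_path, 'rb') as f:
            ret_rms = pickle.load(f)
    return policy_net, encoder, ret_rms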