Example 1
    def log(self, run_stats, train_stats, start_time):
        train_stats, meta_train_stats = train_stats

        # --- visualise behaviour of policy ---

        if self.iter_idx % self.args.vis_interval == 0:
            obs_rms = self.envs.venv.obs_rms if self.args.norm_obs_for_policy else None
            ret_rms = self.envs.venv.ret_rms if self.args.norm_rew_for_policy else None

            utl_eval.visualise_behaviour(
                args=self.args,
                policy=self.policy,
                image_folder=self.logger.full_output_folder,
                iter_idx=self.iter_idx,
                obs_rms=obs_rms,
                ret_rms=ret_rms,
                encoder=self.vae.encoder,
                reward_decoder=self.vae.reward_decoder,
                state_decoder=self.vae.state_decoder,
                task_decoder=self.vae.task_decoder,
                compute_rew_reconstruction_loss=self.vae.compute_rew_reconstruction_loss,
                compute_state_reconstruction_loss=self.vae.compute_state_reconstruction_loss,
                compute_task_reconstruction_loss=self.vae.compute_task_reconstruction_loss,
                compute_kl_loss=self.vae.compute_kl_loss,
            )

        # --- evaluate policy ----

        if self.iter_idx % self.args.eval_interval == 0:

            obs_rms = self.envs.venv.obs_rms if self.args.norm_obs_for_policy else None
            ret_rms = self.envs.venv.ret_rms if self.args.norm_rew_for_policy else None

            returns_per_episode = utl_eval.evaluate(args=self.args,
                                                    policy=self.policy,
                                                    obs_rms=obs_rms,
                                                    ret_rms=ret_rms,
                                                    encoder=self.vae.encoder,
                                                    iter_idx=self.iter_idx)

            # log the return avg/std across tasks (=processes)
            returns_avg = returns_per_episode.mean(dim=0)
            returns_std = returns_per_episode.std(dim=0)
            for k in range(len(returns_avg)):
                self.logger.add('return_avg_per_iter/episode_{}'.format(k + 1),
                                returns_avg[k], self.iter_idx)
                self.logger.add(
                    'return_avg_per_frame/episode_{}'.format(k + 1),
                    returns_avg[k], self.frames)
                self.logger.add('return_std_per_iter/episode_{}'.format(k + 1),
                                returns_std[k], self.iter_idx)
                self.logger.add(
                    'return_std_per_frame/episode_{}'.format(k + 1),
                    returns_std[k], self.frames)

            print(
                "Updates {}, num timesteps {}, FPS {}, {} \n Mean return (train): {:.5f} \n"
                .format(self.iter_idx, self.frames,
                        int(self.frames / (time.time() - start_time)),
                        self.vae.rollout_storage.prev_obs.shape,
                        returns_avg[-1].item()))

        # --- save models ---

        if self.iter_idx % self.args.save_interval == 0:
            save_path = os.path.join(self.logger.full_output_folder, 'models')
            if not os.path.exists(save_path):
                os.mkdir(save_path)
            torch.save(
                self.policy.actor_critic,
                os.path.join(save_path, "policy{0}.pt".format(self.iter_idx)))
            torch.save(
                self.vae.encoder,
                os.path.join(save_path, "encoder{0}.pt".format(self.iter_idx)))
            if self.vae.state_decoder is not None:
                torch.save(
                    self.vae.state_decoder,
                    os.path.join(save_path,
                                 "state_decoder{0}.pt".format(self.iter_idx)))
            if self.vae.reward_decoder is not None:
                torch.save(
                    self.vae.reward_decoder,
                    os.path.join(save_path,
                                 "reward_decoder{0}.pt".format(self.iter_idx)))
            if self.vae.task_decoder is not None:
                torch.save(
                    self.vae.task_decoder,
                    os.path.join(save_path,
                                 "task_decoder{0}.pt".format(self.iter_idx)))

            # save normalisation params of envs
            if self.args.norm_rew_for_policy:
                # save rolling mean and std
                rew_rms = self.envs.venv.ret_rms
                utl.save_obj(rew_rms, save_path,
                             "env_rew_rms{0}.pkl".format(self.iter_idx))
            if self.args.norm_obs_for_policy:
                obs_rms = self.envs.venv.obs_rms
                utl.save_obj(obs_rms, save_path,
                             "env_obs_rms{0}.pkl".format(self.iter_idx))

        # --- log some other things ---

        if self.iter_idx % self.args.log_interval == 0:

            self.logger.add('policy_losses/value_loss', train_stats[0],
                            self.iter_idx)
            self.logger.add('policy_losses/action_loss', train_stats[1],
                            self.iter_idx)
            self.logger.add('policy_losses/dist_entropy', train_stats[2],
                            self.iter_idx)
            self.logger.add('policy_losses/sum', train_stats[3], self.iter_idx)

            self.logger.add('policy/action', run_stats[0][0].float().mean(),
                            self.iter_idx)
            if hasattr(self.policy.actor_critic, 'logstd'):
                self.logger.add('policy/action_logstd',
                                self.policy.actor_critic.dist.logstd.mean(),
                                self.iter_idx)
            self.logger.add('policy/action_logprob', run_stats[1].mean(),
                            self.iter_idx)
            self.logger.add('policy/value', run_stats[2].mean(), self.iter_idx)

            self.logger.add('encoder/latent_mean',
                            torch.cat(self.policy_storage.latent_mean).mean(),
                            self.iter_idx)
            self.logger.add(
                'encoder/latent_logvar',
                torch.cat(self.policy_storage.latent_logvar).mean(),
                self.iter_idx)

            # log the average weights and gradients of all models (where applicable)
            for model, name in [(self.policy.actor_critic, 'policy'),
                                (self.vae.encoder, 'encoder'),
                                (self.vae.reward_decoder, 'reward_decoder'),
                                (self.vae.state_decoder, 'state_transition_decoder'),
                                (self.vae.task_decoder, 'task_decoder')]:
                if model is not None:
                    param_list = list(model.parameters())
                    param_mean = np.mean([
                        param_list[i].data.cpu().numpy().mean()
                        for i in range(len(param_list))
                    ])
                    self.logger.add('weights/{}'.format(name), param_mean,
                                    self.iter_idx)
                    if name == 'policy':
                        self.logger.add('weights/policy_std',
                                        param_list[0].data.mean(),
                                        self.iter_idx)
                    if param_list[0].grad is not None:
                        param_grad_mean = np.mean([
                            param_list[i].grad.cpu().numpy().mean()
                            for i in range(len(param_list))
                        ])
                        self.logger.add('gradients/{}'.format(name),
                                        param_grad_mean, self.iter_idx)
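
Since this example saves entire modules with torch.save(module, path) rather than state dicts, reloading them later is a single torch.load call per file. The sketch below shows one way such a checkpoint could be reloaded; the helper name load_checkpoint and the device argument are illustrative assumptions, not part of the original code.

    # Minimal loader sketch (assumed helper, not part of the original example).
    import os
    import torch

    def load_checkpoint(save_path, iter_idx, device='cpu'):
        # torch.save was called on the full modules above, so torch.load
        # returns ready-to-use nn.Module instances.
        policy = torch.load(os.path.join(save_path, 'policy{}.pt'.format(iter_idx)),
                            map_location=device)
        encoder = torch.load(os.path.join(save_path, 'encoder{}.pt'.format(iter_idx)),
                             map_location=device)
        policy.eval()
        encoder.eval()
        return policy, encoder
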
Example 2
    def __init__(self, args):

        self.args = args
        utl.seed(self.args.seed, self.args.deterministic_execution)

        # calculate number of updates and keep count of frames/iterations
        self.num_updates = int(args.num_frames) // args.policy_num_steps // args.num_processes
        self.frames = 0
        self.iter_idx = -1

        # initialise tensorboard logger
        self.logger = TBLogger(self.args, self.args.exp_label)

        # initialise environments
        self.envs = make_vec_envs(
            env_name=args.env_name,
            seed=args.seed,
            num_processes=args.num_processes,
            gamma=args.policy_gamma,
            device=device,
            episodes_per_task=self.args.max_rollouts_per_task,
            normalise_rew=args.norm_rew_for_policy,
            ret_rms=None,
            tasks=None)

        if self.args.single_task_mode:
            # get the current tasks (which will be num_processes-many different tasks)
            self.train_tasks = self.envs.get_task()
            # set the tasks to the first task (i.e. just a random task)
            self.train_tasks[1:] = self.train_tasks[0]
            # make it a list
            self.train_tasks = [t for t in self.train_tasks]
            # re-initialise environments with those tasks
            self.envs = make_vec_envs(
                env_name=args.env_name,
                seed=args.seed,
                num_processes=args.num_processes,
                gamma=args.policy_gamma,
                device=device,
                episodes_per_task=self.args.max_rollouts_per_task,
                normalise_rew=args.norm_rew_for_policy,
                ret_rms=None,
                tasks=self.train_tasks,
            )
            # save the training tasks so we can evaluate on the same envs later
            utl.save_obj(self.train_tasks, self.logger.full_output_folder,
                         "train_tasks")
        else:
            self.train_tasks = None

        # calculate what the maximum length of the trajectories is
        args.max_trajectory_len = self.envs._max_episode_steps
        args.max_trajectory_len *= self.args.max_rollouts_per_task

        # get policy input dimensions
        self.args.state_dim = self.envs.observation_space.shape[0]
        self.args.task_dim = self.envs.task_dim
        self.args.belief_dim = self.envs.belief_dim
        self.args.num_states = self.envs.num_states
        # get policy output (action) dimensions
        self.args.action_space = self.envs.action_space
        if isinstance(self.envs.action_space, gym.spaces.discrete.Discrete):
            self.args.action_dim = 1
        else:
            self.args.action_dim = self.envs.action_space.shape[0]

        # initialise policy
        self.policy_storage = self.initialise_policy_storage()
        self.policy = self.initialise_policy()
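
Two bits of bookkeeping in this constructor are easy to miss: the total number of policy updates follows from the frame budget, the steps collected per update, and the number of parallel processes via integer division, and the action dimensionality depends on whether the action space is discrete. A small sketch with assumed numbers (not taken from the example) illustrates both.

    # Assumed values for illustration only.
    import gym

    num_frames = 1_000_000       # total environment frames to collect
    policy_num_steps = 200       # steps per process per policy update
    num_processes = 16           # parallel environments
    num_updates = int(num_frames) // policy_num_steps // num_processes  # -> 312

    action_space = gym.spaces.Discrete(5)
    if isinstance(action_space, gym.spaces.Discrete):
        action_dim = 1           # discrete actions are stored as a single index
    else:
        action_dim = action_space.shape[0]
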
Example 3
    def log(self, run_stats, train_stats, start):
        """
        Evaluate policy, save model, write to tensorboard logger.
        """

        # --- visualise behaviour of policy ---

        if self.iter_idx % self.args.vis_interval == 0:

            ret_rms = self.envs.venv.ret_rms if self.args.norm_rew_for_policy else None
            utl_eval.visualise_behaviour(
                args=self.args,
                policy=self.policy,
                image_folder=self.logger.full_output_folder,
                iter_idx=self.iter_idx,
                ret_rms=ret_rms,
            )

        # --- evaluate policy ----

        if self.iter_idx % self.args.eval_interval == 0:

            ret_rms = self.envs.venv.ret_rms if self.args.norm_rew_for_policy else None

            returns_per_episode = utl_eval.evaluate(args=self.args,
                                                    policy=self.policy,
                                                    ret_rms=ret_rms,
                                                    iter_idx=self.iter_idx)

            # log the average return across tasks (=processes)
            returns_avg = returns_per_episode.mean(dim=0)
            returns_std = returns_per_episode.std(dim=0)
            for k in range(len(returns_avg)):
                self.logger.add('return_avg_per_iter/episode_{}'.format(k + 1),
                                returns_avg[k], self.iter_idx)
                self.logger.add(
                    'return_avg_per_frame/episode_{}'.format(k + 1),
                    returns_avg[k], self.frames)
                self.logger.add('return_std_per_iter/episode_{}'.format(k + 1),
                                returns_std[k], self.iter_idx)
                self.logger.add(
                    'return_std_per_frame/episode_{}'.format(k + 1),
                    returns_std[k], self.frames)

            print(
                "Updates {}, num timesteps {}, FPS {} \n Mean return (train): {:.5f} \n"
                .format(self.iter_idx, self.frames,
                        int(self.frames / (time.time() - start)),
                        returns_avg[-1].item()))

        # save model
        if self.iter_idx % self.args.save_interval == 0:
            save_path = os.path.join(self.logger.full_output_folder, 'models')
            if not os.path.exists(save_path):
                os.mkdir(save_path)

            idx_labels = ['']
            if self.args.save_intermediate_models:
                idx_labels.append(int(self.iter_idx))

            for idx_label in idx_labels:

                torch.save(self.policy.actor_critic,
                           os.path.join(save_path, f"policy{idx_label}.pt"))

                # save normalisation params of envs
                if self.args.norm_rew_for_policy:
                    rew_rms = self.envs.venv.ret_rms
                    utl.save_obj(rew_rms, save_path, f"env_rew_rms{idx_label}")
                # TODO: grab from policy and save?
                # if self.args.norm_obs_for_policy:
                #     obs_rms = self.envs.venv.obs_rms
                #     utl.save_obj(obs_rms, save_path, f"env_obs_rms{idx_label}")

        # --- log some other things ---

        if self.iter_idx % self.args.log_interval == 0 and train_stats is not None:

            train_stats, _ = train_stats

            self.logger.add('policy_losses/value_loss', train_stats[0],
                            self.iter_idx)
            self.logger.add('policy_losses/action_loss', train_stats[1],
                            self.iter_idx)
            self.logger.add('policy_losses/dist_entropy', train_stats[2],
                            self.iter_idx)
            self.logger.add('policy_losses/sum', train_stats[3], self.iter_idx)

            # writer.add_scalar('policy/action', action.mean(), j)
            self.logger.add('policy/action', run_stats[0][0].float().mean(),
                            self.iter_idx)
            if hasattr(self.policy.actor_critic, 'logstd'):
                self.logger.add('policy/action_logstd',
                                self.policy.actor_critic.dist.logstd.mean(),
                                self.iter_idx)
            self.logger.add('policy/action_logprob', run_stats[1].mean(),
                            self.iter_idx)
            self.logger.add('policy/value', run_stats[2].mean(), self.iter_idx)

            param_list = list(self.policy.actor_critic.parameters())
            param_mean = np.mean([
                param_list[i].data.cpu().numpy().mean()
                for i in range(len(param_list))
            ])
            param_grad_mean = np.mean([
                param_list[i].grad.cpu().numpy().mean()
                for i in range(len(param_list))
            ])
            self.logger.add('weights/policy', param_mean, self.iter_idx)
            self.logger.add('weights/policy_std',
                            param_list[0].data.cpu().mean(), self.iter_idx)
            self.logger.add('gradients/policy', param_grad_mean, self.iter_idx)
            self.logger.add('gradients/policy_std',
                            param_list[0].grad.cpu().numpy().mean(),
                            self.iter_idx)
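
Note that this example averages parameter gradients without checking whether .grad is populated, so calling log before the first backward pass would raise an AttributeError; Examples 1 and 4 guard against this. A hedged sketch of the same diagnostic factored into a helper with that guard is shown below; the function name mean_weights_and_grads is an assumption for illustration.

    # Sketch of a guarded weight/gradient averaging helper (assumed name).
    import numpy as np
    import torch.nn as nn

    def mean_weights_and_grads(model: nn.Module):
        params = list(model.parameters())
        weight_mean = np.mean([p.data.cpu().numpy().mean() for p in params])
        grad_means = [p.grad.cpu().numpy().mean() for p in params if p.grad is not None]
        grad_mean = np.mean(grad_means) if grad_means else None
        return weight_mean, grad_mean
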
Example 4
    def log(self, run_stats, train_stats, start_time):

        # --- visualise behaviour of policy ---

        if (self.iter_idx + 1) % self.args.vis_interval == 0:
            ret_rms = self.envs.venv.ret_rms if self.args.norm_rew_for_policy else None
            utl_eval.visualise_behaviour(
                args=self.args,
                policy=self.policy,
                image_folder=self.logger.full_output_folder,
                iter_idx=self.iter_idx,
                ret_rms=ret_rms,
                encoder=self.vae.encoder,
                reward_decoder=self.vae.reward_decoder,
                state_decoder=self.vae.state_decoder,
                task_decoder=self.vae.task_decoder,
                compute_rew_reconstruction_loss=self.vae.compute_rew_reconstruction_loss,
                compute_state_reconstruction_loss=self.vae.compute_state_reconstruction_loss,
                compute_task_reconstruction_loss=self.vae.compute_task_reconstruction_loss,
                compute_kl_loss=self.vae.compute_kl_loss,
                tasks=self.train_tasks,
            )

        # --- evaluate policy ----

        if (self.iter_idx + 1) % self.args.eval_interval == 0:

            ret_rms = self.envs.venv.ret_rms if self.args.norm_rew_for_policy else None
            returns_per_episode = utl_eval.evaluate(
                args=self.args,
                policy=self.policy,
                ret_rms=ret_rms,
                encoder=self.vae.encoder,
                iter_idx=self.iter_idx,
                tasks=self.train_tasks,
            )

            # log the return avg/std across tasks (=processes)
            returns_avg = returns_per_episode.mean(dim=0)
            returns_std = returns_per_episode.std(dim=0)
            for k in range(len(returns_avg)):
                self.logger.add('return_avg_per_iter/episode_{}'.format(k + 1),
                                returns_avg[k], self.iter_idx)
                self.logger.add(
                    'return_avg_per_frame/episode_{}'.format(k + 1),
                    returns_avg[k], self.frames)
                self.logger.add('return_std_per_iter/episode_{}'.format(k + 1),
                                returns_std[k], self.iter_idx)
                self.logger.add(
                    'return_std_per_frame/episode_{}'.format(k + 1),
                    returns_std[k], self.frames)

            print(f"Updates {self.iter_idx}, "
                  f"Frames {self.frames}, "
                  f"FPS {int(self.frames / (time.time() - start_time))}, "
                  f"\n Mean return (train): {returns_avg[-1].item()} \n")

        # --- save models ---

        if (self.iter_idx + 1) % self.args.save_interval == 0:
            save_path = os.path.join(self.logger.full_output_folder, 'models')
            if not os.path.exists(save_path):
                os.mkdir(save_path)

            idx_labels = ['']
            if self.args.save_intermediate_models:
                idx_labels.append(int(self.iter_idx))

            for idx_label in idx_labels:

                torch.save(self.policy.actor_critic,
                           os.path.join(save_path, f"policy{idx_label}.pt"))
                torch.save(self.vae.encoder,
                           os.path.join(save_path, f"encoder{idx_label}.pt"))
                if self.vae.state_decoder is not None:
                    torch.save(
                        self.vae.state_decoder,
                        os.path.join(save_path,
                                     f"state_decoder{idx_label}.pt"))
                if self.vae.reward_decoder is not None:
                    torch.save(
                        self.vae.reward_decoder,
                        os.path.join(save_path,
                                     f"reward_decoder{idx_label}.pt"))
                if self.vae.task_decoder is not None:
                    torch.save(
                        self.vae.task_decoder,
                        os.path.join(save_path, f"task_decoder{idx_label}.pt"))

                # save normalisation params of envs
                if self.args.norm_rew_for_policy:
                    rew_rms = self.envs.venv.ret_rms
                    utl.save_obj(rew_rms, save_path, f"env_rew_rms{idx_label}")
                # TODO: grab from policy and save?
                # if self.args.norm_obs_for_policy:
                #     obs_rms = self.envs.venv.obs_rms
                #     utl.save_obj(obs_rms, save_path, f"env_obs_rms{idx_label}")

        # --- log some other things ---

        if (self.iter_idx + 1) % self.args.log_interval == 0 and train_stats is not None:

            self.logger.add('environment/state_max',
                            self.policy_storage.prev_state.max(),
                            self.iter_idx)
            self.logger.add('environment/state_min',
                            self.policy_storage.prev_state.min(),
                            self.iter_idx)

            self.logger.add('environment/rew_max',
                            self.policy_storage.rewards_raw.max(),
                            self.iter_idx)
            self.logger.add('environment/rew_min',
                            self.policy_storage.rewards_raw.min(),
                            self.iter_idx)

            self.logger.add('policy_losses/value_loss', train_stats[0],
                            self.iter_idx)
            self.logger.add('policy_losses/action_loss', train_stats[1],
                            self.iter_idx)
            self.logger.add('policy_losses/dist_entropy', train_stats[2],
                            self.iter_idx)
            self.logger.add('policy_losses/sum', train_stats[3], self.iter_idx)

            self.logger.add('policy/action', run_stats[0][0].float().mean(),
                            self.iter_idx)
            if hasattr(self.policy.actor_critic, 'logstd'):
                self.logger.add('policy/action_logstd',
                                self.policy.actor_critic.dist.logstd.mean(),
                                self.iter_idx)
            self.logger.add('policy/action_logprob', run_stats[1].mean(),
                            self.iter_idx)
            self.logger.add('policy/value', run_stats[2].mean(), self.iter_idx)

            self.logger.add('encoder/latent_mean',
                            torch.cat(self.policy_storage.latent_mean).mean(),
                            self.iter_idx)
            self.logger.add(
                'encoder/latent_logvar',
                torch.cat(self.policy_storage.latent_logvar).mean(),
                self.iter_idx)

            # log the average weights and gradients of all models (where applicable)
            for model, name in [(self.policy.actor_critic, 'policy'),
                                (self.vae.encoder, 'encoder'),
                                (self.vae.reward_decoder, 'reward_decoder'),
                                (self.vae.state_decoder, 'state_transition_decoder'),
                                (self.vae.task_decoder, 'task_decoder')]:
                if model is not None:
                    param_list = list(model.parameters())
                    param_mean = np.mean([
                        param_list[i].data.cpu().numpy().mean()
                        for i in range(len(param_list))
                    ])
                    self.logger.add('weights/{}'.format(name), param_mean,
                                    self.iter_idx)
                    if name == 'policy':
                        self.logger.add('weights/policy_std',
                                        param_list[0].data.mean(),
                                        self.iter_idx)
                    if param_list[0].grad is not None:
                        param_grad_mean = np.mean([
                            param_list[i].grad.cpu().numpy().mean()
                            for i in range(len(param_list))
                        ])
                        self.logger.add('gradients/{}'.format(name),
                                        param_grad_mean, self.iter_idx)
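
None of these examples shows the training loop that drives log(); a minimal sketch of how such a loop might look is given below. Everything here except the log() signature is an assumption made for illustration; in particular, rollout_and_update is a hypothetical helper standing in for the actual rollout and policy/VAE update code.

    # Hypothetical driver loop for the log() method above (illustrative only).
    import time

    def train(learner, num_updates):
        start_time = time.time()
        for update in range(num_updates):
            learner.iter_idx = update
            # collect rollouts and update the policy (hypothetical helper)
            run_stats, train_stats = learner.rollout_and_update()
            learner.frames += learner.args.policy_num_steps * learner.args.num_processes
            learner.log(run_stats, train_stats, start_time)
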