Example #1
    def optimize_policy(self,
                        samples_data,
                        log=True,
                        prefix='',
                        verbose=False):
        """
        Performs policy optimization

        Args:
            samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update
            log (bool) : whether to log statistics

        Returns:
            None
        """
        #input_dict = self._extract_input_dict(samples_data, self._optimization_keys, prefix='train')

        if verbose:
            logger.log("Optimizing")
        loss_before = self.optimizer.optimize(data=samples_data)

        if verbose:
            logger.log("Computing statistics")
        loss_after = self.optimizer.loss(data=samples_data)

        if log:
            logger.logkv(prefix + 'LossBefore', loss_before.numpy())
            logger.logkv(prefix + 'LossAfter', loss_after.numpy())
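The `.numpy()` calls imply an eager-mode optimizer whose `optimize` and `loss` methods take the processed samples and return scalar tensors. A minimal sketch of that assumed interface (names and implementation are illustrative, not taken from the original code):

# Minimal sketch (assumption): the optimizer interface that optimize_policy relies on.
import tensorflow as tf

class SimpleFirstOrderOptimizer:
    def __init__(self, loss_fn, variables, learning_rate=1e-3):
        self._loss_fn = loss_fn          # callable: samples_data -> scalar tf.Tensor
        self._variables = variables      # list of tf.Variable to update
        self._opt = tf.keras.optimizers.Adam(learning_rate)

    def loss(self, data):
        return self._loss_fn(data)       # scalar tensor; the caller does .numpy()

    def optimize(self, data):
        with tf.GradientTape() as tape:
            loss = self._loss_fn(data)
        grads = tape.gradient(loss, self._variables)
        self._opt.apply_gradients(zip(grads, self._variables))
        return loss                      # loss value before the update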
Example #2
    def log_diagnostics(self, paths, prefix=''):
        """
        Log extra information per iteration based on the collected paths
        """
        log_stds = np.vstack(
            [path["agent_infos"]["log_std"] for path in paths])
        means = np.vstack([path["agent_infos"]["mean"] for path in paths])
        logger.logkv(prefix + 'AveragePolicyStd', np.mean(np.exp(log_stds)))
        logger.logkv(prefix + 'AverageAbsPolicyMean', np.mean(np.abs(means)))
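For reference, each path is expected to carry an `agent_infos` dict with per-timestep `mean` and `log_std` arrays. A self-contained illustration of the two diagnostics, using made-up shapes:

# Standalone illustration (hypothetical data): how the two diagnostics are computed.
import numpy as np

paths = [
    {"agent_infos": {"mean": np.zeros((100, 6)), "log_std": np.full((100, 6), -0.5)}},
    {"agent_infos": {"mean": np.ones((80, 6)),  "log_std": np.full((80, 6), -1.0)}},
]

log_stds = np.vstack([path["agent_infos"]["log_std"] for path in paths])  # (180, 6)
means = np.vstack([path["agent_infos"]["mean"] for path in paths])        # (180, 6)

print("AveragePolicyStd", np.mean(np.exp(log_stds)))      # mean of exp(log_std)
print("AverageAbsPolicyMean", np.mean(np.abs(means)))     # mean absolute action mean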
Example #3
    def obtain_samples(self, log=False, log_prefix='', buffer=None, random=False):
        """
        Collect batch_size trajectories from each task
        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random
        Returns:
            (dict) : A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
        """

        # initial setup / preparation
        policy = self.policy
        policy.reset(dones=[True] * self.num_rollouts)

        # initial reset of meta_envs
        init_obses = np.array([self.env.reset() for _ in range(self.num_rollouts)])

        observations, actions, means, log_stds, rewards = self.samples(init_obses)

        means = np.array(means).transpose((1, 0, 2))
        log_stds = np.array(log_stds).transpose((1, 0, 2))
        if log_stds.shape[0] == 1:
            log_stds = np.repeat(log_stds, self.num_rollouts, axis=0)
        agent_infos = [dict(mean=mean, log_std=log_std) for mean, log_std in zip(means, log_stds)]
        observations = np.array(observations).transpose((1, 0, 2))
        actions = np.array(actions).transpose((1, 0, 2))
        rewards = np.array(rewards).T
        dones = [[False for _ in range(self.max_path_length)] for _ in range(self.num_rollouts)]
        env_infos = [dict() for _ in range(self.num_rollouts)]
        paths = [dict(observations=obs, actions=act, rewards=rew,
                      dones=done, env_infos=env_info, agent_infos=agent_info) for
                 obs, act, rew, done, env_info, agent_info in
                 zip(observations, actions, rewards, dones, env_infos, agent_infos)]
        self.total_timesteps_sampled += self.total_samples
        logger.logkv('ModelSampler-n_timesteps', self.total_timesteps_sampled)

        return paths
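The transposes assume `self.samples` returns time-major arrays of shape `(max_path_length, num_rollouts, dim)`, which are flipped to rollout-major before being split into per-rollout paths. A small standalone illustration with hypothetical sizes:

# Shape illustration (hypothetical sizes): time-major rollout data -> per-rollout arrays.
import numpy as np

max_path_length, num_rollouts, obs_dim = 200, 8, 17

# assumed time-major layout: (max_path_length, num_rollouts, dim)
observations = np.random.randn(max_path_length, num_rollouts, obs_dim)
rewards = np.random.randn(max_path_length, num_rollouts)

observations = observations.transpose((1, 0, 2))   # (num_rollouts, T, obs_dim)
rewards = rewards.T                                 # (num_rollouts, T)

per_rollout_obs = [obs for obs in observations]     # each entry: (T, obs_dim)
print(per_rollout_obs[0].shape, rewards[0].shape)   # (200, 17) (200,)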
Example #4
    def train(self):
        """
        Trains policy on env using algo

        Pseudocode:
            for itr in n_itr:
                sampler.obtain_samples()
                sample_processor.process_samples()
                algo.optimize_policy()
        """

        # initialize uninitialized vars  (only initialize vars that were not loaded)

        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            logger.log("\n ---------------- Iteration %d ----------------" % itr)
            logger.log("Sampling set of tasks/goals for this meta-batch...")

            """ -------------------- Sampling --------------------------"""

            logger.log("Obtaining samples...")
            time_env_sampling_start = time.time()
            paths = self.sampler.obtain_samples(log=True, log_prefix='train-')
            sampling_time = time.time() - time_env_sampling_start

            """ ----------------- Processing Samples ---------------------"""

            logger.log("Processing samples...")
            time_proc_samples_start = time.time()
            samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='train-')
            proc_samples_time = time.time() - time_proc_samples_start

            """ ------------------ Policy Update ---------------------"""

            logger.log("Optimizing policy...")
            # This needs to take all samples_data so that it can construct graph for meta-optimization.
            time_optimization_step_start = time.time()
            self.algo.optimize_policy(samples_data)

            """ ------------------- Logging Stuff --------------------------"""
            logger.logkv('Itr', itr)
            logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)

            logger.logkv('Time-Optimization', time.time() - time_optimization_step_start)
            logger.logkv('Time-SampleProc', np.sum(proc_samples_time))
            logger.logkv('Time-Sampling', sampling_time)

            logger.logkv('Time', time.time() - start_time)
            logger.logkv('ItrTime', time.time() - itr_start_time)

            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr)
            logger.save_itr_params(itr, params)
            logger.log("Saved")

            logger.dumpkvs()

        logger.log("Training finished")
Example #5
    def obtain_samples(self, log=False, log_prefix='', random=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random

        Returns:
            (dict) : A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
        """

        # initial setup / preparation
        paths = []

        n_samples = 0
        running_paths = _get_empty_running_paths_dict()

        if log: pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy
        policy.reset(dones=[True])

        # initial reset of meta_envs
        obs = np.asarray(self.env.reset())

        ts = 0

        while n_samples < self.total_samples:

            # execute policy
            t = time.time()
            if random:
                action = self.env.action_space.sample()
                agent_info = {}
            else:
                action, agent_info = policy.get_action(obs)
                if action.ndim == 2:
                    action = action[0]
            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obs, reward, done, env_info = self.env.step(action)

            ts += 1
            done = done or ts >= self.max_path_length
            if done:
                next_obs = self.env.reset()
                ts = 0

            env_time += time.time() - t

            new_samples = 0

            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths["observations"].append(obs)
            running_paths["actions"].append(action)
            running_paths["rewards"].append(reward)
            running_paths["dones"].append(done)
            running_paths["env_infos"].append(env_info)
            running_paths["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done:
                paths.append(
                    dict(
                        observations=np.asarray(running_paths["observations"]),
                        actions=np.asarray(running_paths["actions"]),
                        rewards=np.asarray(running_paths["rewards"]),
                        dones=np.asarray(running_paths["dones"]),
                        env_infos=utils.stack_tensor_dict_list(
                            running_paths["env_infos"]),
                        agent_infos=utils.stack_tensor_dict_list(
                            running_paths["agent_infos"]),
                    ))
                new_samples += len(running_paths["rewards"])
                running_paths = _get_empty_running_paths_dict()

            if log: pbar.update(new_samples)
            n_samples += new_samples
            obs = next_obs
        if log: pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
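Two helpers referenced above are not shown here. A plausible sketch of both, assuming `_get_empty_running_paths_dict` pre-creates the per-trajectory lists and `utils.stack_tensor_dict_list` stacks a list of (possibly nested) dicts into dicts of arrays:

import numpy as np

def _get_empty_running_paths_dict():
    # Sketch (assumption): one empty list per field collected along a trajectory.
    return dict(observations=[], actions=[], rewards=[], dones=[],
                env_infos=[], agent_infos=[])

def stack_tensor_dict_list(tensor_dict_list):
    # Sketch (assumption): [{'a': x1, 'b': {'c': y1}}, ...] -> {'a': stack(x), 'b': {'c': stack(y)}}
    keys = tensor_dict_list[0].keys() if tensor_dict_list else []
    stacked = {}
    for key in keys:
        values = [d[key] for d in tensor_dict_list]
        if isinstance(values[0], dict):
            stacked[key] = stack_tensor_dict_list(values)
        else:
            stacked[key] = np.asarray(values)
    return stacked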
Example #6
    def _log_path_stats(self,
                        paths,
                        log=False,
                        log_prefix='',
                        return_avg_return=False):
        """
        Logs return statistics for the given paths

        Args:
            paths (list) : list of path dicts containing "rewards" and "returns"
            log (bool or str) : 'reward' logs only the average return; 'all' or True logs the full set of statistics
            log_prefix (str) : prefix for the logged keys
            return_avg_return (bool) : unused in this snippet; the average undiscounted return is always returned

        Returns:
            (float) : average undiscounted return across paths
        """
        # compute log stats
        average_discounted_return = np.mean(
            [path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        if log == 'reward':
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))

        elif log == 'all' or log is True:
            logger.logkv(log_prefix + 'AverageDiscountedReturn',
                         average_discounted_return)
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))
            logger.logkv(log_prefix + 'NumTrajs', len(paths))
            logger.logkv(log_prefix + 'StdReturn',
                         np.std(undiscounted_returns))
            logger.logkv(log_prefix + 'MaxReturn',
                         np.max(undiscounted_returns))
            logger.logkv(log_prefix + 'MinReturn',
                         np.min(undiscounted_returns))

        return np.mean(undiscounted_returns)
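`path["returns"]` is assumed to already hold per-timestep discounted returns (so `path["returns"][0]` is the discounted return of the whole trajectory), presumably filled in by the sample processor. A minimal sketch of that computation:

import numpy as np

def discount_cumsum(rewards, discount):
    # Sketch (assumption): returns[t] = sum_{k >= t} discount**(k - t) * rewards[k]
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns

path = {"rewards": np.array([1.0, 1.0, 1.0])}
path["returns"] = discount_cumsum(path["rewards"], discount=0.99)
print(path["returns"][0])   # discounted return of the full trajectory: 1 + 0.99 + 0.99**2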
Example #7
    def train(self):
        """
        Trains policy on env using algo

        Pseudocode:
            for itr in n_itr:
                env_sampler.obtain_samples()
                dynamics_model.fit()
                for step in steps_per_iter[itr]:
                    model_sampler.obtain_samples()
                    model_sample_processor.process_samples()
                    algo.optimize_policy()
        """


        if isinstance(self.steps_per_iter, tuple):
            steps_per_iter = np.linspace(self.steps_per_iter[0],
                                         self.steps_per_iter[1], self.n_itr).astype(int)
        else:
            steps_per_iter = [self.steps_per_iter] * self.n_itr

        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            logger.log("\n ---------------- Iteration %d ----------------" % itr)

            time_env_sampling_start = time.time()

            if self.initial_random_samples and itr == 0:
                logger.log("Obtaining random samples from the environment...")
                env_paths = self.env_sampler.obtain_samples(log=True, random=True, log_prefix='Data-EnvSampler-')

            else:
                logger.log("Obtaining samples from the environment using the policy...")
                env_paths = self.env_sampler.obtain_samples(log=True, log_prefix='Data-EnvSampler-')

            # Add sleeping time to match parallel experiment
            # time.sleep(10)

            logger.record_tabular('Data-TimeEnvSampling', time.time() - time_env_sampling_start)
            logger.log("Processing environment samples...")

            # first processing just for logging purposes
            time_env_samp_proc = time.time()

            samples_data = self.dynamics_sample_processor.process_samples(env_paths, log=True,
                                                                          log_prefix='Data-EnvTrajs-')

            logger.record_tabular('Data-TimeEnvSampleProc', time.time() - time_env_samp_proc)

            ''' --------------- fit dynamics model --------------- '''

            time_fit_start = time.time()

            logger.log("Training dynamics model for %i epochs ..." % (self.dynamics_model_max_epochs))
            self.dynamics_model.fit(samples_data['observations'],
                                    samples_data['actions'],
                                    samples_data['next_observations'],
                                    epochs=self.dynamics_model_max_epochs, verbose=True,
                                    log_tabular=True, prefix='Model-')

            buffer = None if not self.sample_from_buffer else samples_data

            logger.record_tabular('Model-TimeModelFit', time.time() - time_fit_start)

            ''' --------------- MAML steps --------------- '''
            times_dyn_sampling = []
            times_dyn_sample_processing = []
            times_optimization = []
            times_step = []

            for step in range(steps_per_iter[itr]):

                logger.log("\n ---------------- Grad-Step %d ----------------" % int(sum(steps_per_iter[:itr])
                                                                                     + step))
                step_start_time = time.time()

                """ -------------------- Sampling --------------------------"""

                logger.log("Obtaining samples from the model...")
                time_env_sampling_start = time.time()
                paths = self.model_sampler.obtain_samples(log=True, log_prefix='Policy-', buffer=buffer)
                sampling_time = time.time() - time_env_sampling_start

                """ ----------------- Processing Samples ---------------------"""

                logger.log("Processing samples from the model...")
                time_proc_samples_start = time.time()
                samples_data = self.model_sample_processor.process_samples(paths, log='all', log_prefix='Policy-')
                proc_samples_time = time.time() - time_proc_samples_start

                if isinstance(paths, list):
                    self.log_diagnostics(paths, prefix='Policy-')
                else:
                    self.log_diagnostics(sum(paths.values(), []), prefix='Policy-')

                """ ------------------ Policy Update ---------------------"""

                logger.log("Optimizing policy...")
                time_optimization_step_start = time.time()
                self.algo.optimize_policy(samples_data)
                optimization_time = time.time() - time_optimization_step_start

                times_dyn_sampling.append(sampling_time)
                times_dyn_sample_processing.append(proc_samples_time)
                times_optimization.append(optimization_time)
                times_step.append(time.time() - step_start_time)

            """ ------------------- Logging Stuff --------------------------"""
            logger.logkv('Iteration', itr)
            logger.logkv('n_timesteps', self.env_sampler.total_timesteps_sampled)
            logger.logkv('Policy-TimeSampleProc', np.sum(times_dyn_sample_processing))
            logger.logkv('Policy-TimeSampling', np.sum(times_dyn_sampling))
            logger.logkv('Policy-TimeAlgoOpt', np.sum(times_optimization))
            logger.logkv('Policy-TimeStep', np.sum(times_step))

            logger.logkv('Time', time.time() - start_time)
            logger.logkv('ItrTime', time.time() - itr_start_time)

            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr)
            logger.save_itr_params(itr, params)
            logger.log("Saved")

            logger.dumpkvs()

        logger.logkv('Trainer-TimeTotal', time.time() - start_time)

        logger.log("Training finished")