Example #1
    def log_diagnostics(self, paths, prefix=''):
        """
        Log extra information per iteration based on the collected paths
        """
        log_stds = np.vstack(
            [path["agent_infos"]["log_std"] for path in paths])
        logger.logkv(prefix + 'AveragePolicyStd', np.mean(np.exp(log_stds)))
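Note: the snippet above assumes a specific layout of paths. A minimal, self-contained sketch of that layout and of the computed statistic (the stub logger below is hypothetical and only stands in for the project's logger module):

import numpy as np

class logger:  # hypothetical stand-in for the project's logger
    @staticmethod
    def logkv(key, value):
        print(key, value)

# two fake paths with per-timestep log-stds for a 2-D action space
paths = [
    {"agent_infos": {"log_std": np.full((5, 2), -0.5)}},
    {"agent_infos": {"log_std": np.full((3, 2), -1.0)}},
]

log_stds = np.vstack([path["agent_infos"]["log_std"] for path in paths])
logger.logkv('train-' + 'AveragePolicyStd', np.mean(np.exp(log_stds)))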
Example #2
    def optimize_policy(self, all_samples_data, log=True):
        """
        Performs MAML outer step

        Args:
            all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
             meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        meta_op_input_dict = self._extract_input_dict_meta_op(
            all_samples_data, self._optimization_keys)
        logger.log("Computing KL before")
        mean_kl_before = self.optimizer.constraint_val(meta_op_input_dict)

        logger.log("Computing loss before")
        loss_before = self.optimizer.loss(meta_op_input_dict)
        logger.log("Optimizing")
        self.optimizer.optimize(meta_op_input_dict)
        logger.log("Computing loss after")
        loss_after = self.optimizer.loss(meta_op_input_dict)

        logger.log("Computing KL after")
        mean_kl = self.optimizer.constraint_val(meta_op_input_dict)
        if log:
            logger.logkv('MeanKLBefore', mean_kl_before)
            logger.logkv('MeanKL', mean_kl)

            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
            logger.logkv('dLoss', loss_before - loss_after)
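The method above touches self.optimizer only through loss, constraint_val, and optimize. A minimal stub that satisfies this interface (purely illustrative; the project's actual optimizer is presumably a TRPO-style constrained optimizer, which this does not reproduce):

class DummyConstrainedOptimizer:
    # illustrative stand-in exposing the three methods used by optimize_policy
    def __init__(self):
        self._steps = 0

    def loss(self, input_val_dict):
        # surrogate loss evaluated on the given inputs (fake, decays with steps)
        return 1.0 / (1.0 + self._steps)

    def constraint_val(self, input_val_dict):
        # KL divergence between pre- and post-update policy (fake)
        return 0.01 * self._steps

    def optimize(self, input_val_dict):
        # one constrained optimization step (e.g. conjugate gradient + line search)
        self._steps += 1

opt = DummyConstrainedOptimizer()
loss_before = opt.loss({})
opt.optimize({})
print('dLoss', loss_before - opt.loss({}))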
Example #3
    def optimize_policy(self, samples_data, log=True):
        """
        Performs MAML outer step

        Args:
            samples_data (list) : list of sample dicts used for the policy update
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict(samples_data,
                                              self._optimization_keys,
                                              prefix='train')

        if log: logger.log("Optimizing")
        loss_before = self.optimizer.optimize(input_val_dict=input_dict)

        if log: logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=input_dict)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
Example #4
    def log_diagnostics(self, paths, prefix=''):
        progs = [np.mean(path["env_infos"]["reward_forward"]) for path in paths]
        ctrl_cost = [-np.mean(path["env_infos"]["reward_ctrl"]) for path in paths]

        logger.logkv(prefix + 'AverageForwardReturn', np.mean(progs))
        logger.logkv(prefix + 'MaxForwardReturn', np.max(progs))
        logger.logkv(prefix + 'MinForwardReturn', np.min(progs))
        logger.logkv(prefix + 'StdForwardReturn', np.std(progs))

        logger.logkv(prefix + 'AverageCtrlCost', np.mean(ctrl_cost))
Example #5
    def log_diagnostics(self, paths, prefix=''):
        reach_rew = [path["env_infos"]['reachRew'] for path in paths]
        pick_rew = [path["env_infos"]['pickRew'][-1] for path in paths]
        place_rew = [path["env_infos"]['placeRew'] for path in paths]
        reach_dist = [path["env_infos"]['reachDist'] for path in paths]
        placing_dist = [path["env_infos"]['placingDist'] for path in paths]

        logger.logkv(prefix + 'AverageReachReward', np.mean(reach_rew))
        logger.logkv(prefix + 'AveragePickReward', np.mean(pick_rew))
        logger.logkv(prefix + 'AveragePlaceReward', np.mean(place_rew))
        logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist))
        logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist))
Example #6
    def train(self):

        for i in range(1, self.eff+1):

            with self.sess.as_default() as sess:

                logger.log("----------- Adaptation rollouts per meta-task = ", i, " -----------")
                # self.sampler.rollouts_per_meta_task = 10000
                self.sampler.update_batch_size(i)

                # initialize uninitialized vars  (only initialize vars that were not loaded)
                uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))]
                sess.run(tf.variables_initializer(uninit_vars))

                self.task = self.env.sample_tasks(self.sampler.meta_batch_size, is_eval=True)
                self.sampler.set_tasks(self.task)

                #logger.log("\n ---------------- Iteration %d ----------------" % itr)
                logger.log("Sampling set of tasks/goals for this meta-batch...")

                """ -------------------- Sampling --------------------------"""

                logger.log("Obtaining samples...")
                paths = self.sampler.obtain_samples(log=True, log_prefix='train-')

                """ ----------------- Processing Samples ---------------------"""

                logger.log("Processing samples...")
                samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='train-')
                self.log_diagnostics(sum(paths.values(), []), prefix='train-')

                #""" ------------------ Policy Update ---------------------"""

                #logger.log("Optimizing policy...")
                ## This needs to take all samples_data so that it can construct graph for meta-optimization.
                #time_optimization_step_start = time.time()
                #self.algo.optimize_policy(samples_data)

                """ ------------------- Logging Stuff --------------------------"""
                logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)

                #logger.log("Saving snapshot...")
                #params = self.get_itr_snapshot(itr)
                #logger.save_itr_params(itr, params)
                #logger.log("Saved")

                logger.dumpkvs()
                # if itr == 0:
                    # sess.graph.finalize()

            logger.log("Training finished")
        self.sess.close()
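Throughout these training loops, paths comes back from the sampler as a dict keyed by meta-task index (see obtain_samples in Example #16), and sum(paths.values(), []) simply flattens it into one list of path dicts before diagnostics are logged. A tiny illustration:

paths = {0: ['path_a', 'path_b'], 1: ['path_c']}  # per-task lists of path dicts
flat_paths = sum(paths.values(), [])              # ['path_a', 'path_b', 'path_c']
print(flat_paths)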
Example #7
    def optimize_policy(self, all_samples_data, log=True):
        """
        Performs MAML outer step

        Args:
            all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
             meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        meta_op_input_dict = self._extract_input_dict_meta_op(all_samples_data, self._optimization_keys)

        # add kl_coeffs / clip_eps to meta_op_input_dict
        meta_op_input_dict['inner_kl_coeff'] = self.inner_kl_coeff

        meta_op_input_dict['clip_eps'] = self.clip_eps

        if log: logger.log("Optimizing")
        loss_before = self.optimizer.optimize(input_val_dict=meta_op_input_dict)

        if log: logger.log("Computing statistics")
        loss_after, inner_kls, outer_kl = self.optimizer.compute_stats(input_val_dict=meta_op_input_dict)

        if self.adaptive_inner_kl_penalty:
            if log: logger.log("Updating inner KL loss coefficients")
            self.inner_kl_coeff = self.adapt_kl_coeff(self.inner_kl_coeff, inner_kls, self.target_inner_step)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
            logger.logkv('KLInner', np.mean(inner_kls))
            logger.logkv('KLCoeffInner', np.mean(self.inner_kl_coeff))
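adapt_kl_coeff is not shown in this snippet. A common PPO-style adaptation rule, given here only as an assumption about what such a method typically does, doubles or halves each coefficient depending on whether the measured inner KL overshoots or undershoots the target:

def adapt_kl_coeff_sketch(kl_coeffs, kl_values, kl_target):
    # hypothetical per-task rule; the project's adapt_kl_coeff may differ
    new_coeffs = []
    for coeff, kl in zip(kl_coeffs, kl_values):
        if kl > 1.5 * kl_target:
            coeff *= 2.0   # inner updates moved too far: penalize KL more
        elif kl < kl_target / 1.5:
            coeff /= 2.0   # inner updates barely moved: relax the penalty
        new_coeffs.append(coeff)
    return new_coeffs

print(adapt_kl_coeff_sketch([1.0, 1.0], [0.05, 0.001], kl_target=0.01))  # [2.0, 0.5]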
Example #8
    def optimize_policy(self,
                        all_samples_data,
                        mod_samples_data,
                        num_paths_per_rollout,
                        log=True):
        """
        Performs MAML outer step

        Args:
            all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
             meta task
            mod_samples_data : additional samples fed to the policy's mod_input_var placeholder
            num_paths_per_rollout : number of paths per rollout, fed to the policy's num_paths_var placeholder
            log (bool) : whether to log statistics

        Returns:
            None
        """
        meta_op_input_dict = self._extract_input_dict_meta_op(
            all_samples_data, self._optimization_keys)

        extra_feed_dict = {
            self.policy.mod_input_var: mod_samples_data,
            self.policy.num_paths_var: num_paths_per_rollout,
        }

        # add kl_coeffs / clip_eps to meta_op_input_dict
        meta_op_input_dict['inner_kl_coeff'] = self.inner_kl_coeff

        meta_op_input_dict['clip_eps'] = self.clip_eps

        if log: logger.log("Optimizing")

        loss_before, grad_norms = self.optimizer.optimize(
            input_val_dict=meta_op_input_dict, extra_feed_dict=extra_feed_dict)
        if self.summary_writer is not None:
            for name, norm in grad_norms.items():
                tensorboard_util.log_scalar(self.summary_writer,
                                            'grads/' + name, norm,
                                            self.log_step)
            self.log_step += 1

        if log: logger.log("Computing statistics")
        loss_after, inner_kls, outer_kl = self.optimizer.compute_stats(
            input_val_dict=meta_op_input_dict, extra_feed_dict=extra_feed_dict)

        if self.adaptive_inner_kl_penalty:
            if log: logger.log("Updating inner KL loss coefficients")
            self.inner_kl_coeff = self.adapt_kl_coeff(self.inner_kl_coeff,
                                                      inner_kls,
                                                      self.target_inner_step)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
            logger.logkv('KLInner', np.mean(inner_kls))
            logger.logkv('KLCoeffInner', np.mean(self.inner_kl_coeff))
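tensorboard_util.log_scalar is also not shown. With a TF1-style tf.summary.FileWriter, such a helper is typically a thin wrapper like the following sketch (an assumption, not this project's code):

import tensorflow as tf

def log_scalar(writer, tag, value, step):
    # write one scalar value to TensorBoard using the TF1 summary protobuf API
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=float(value))])
    writer.add_summary(summary, global_step=step)
    writer.flush()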
Example #9
    def log_diagnostics(self, paths, prefix=''):
        fwrd_vel = [path["env_infos"]['forward_vel'] for path in paths]
        final_fwrd_vel = [path["env_infos"]['forward_vel'][-1] for path in paths]
        ctrl_cost = [-path["env_infos"]['reward_ctrl'] for path in paths]

        logger.logkv(prefix + 'AvgForwardVel', np.mean(fwrd_vel))
        logger.logkv(prefix + 'AvgFinalForwardVel', np.mean(final_fwrd_vel))
        logger.logkv(prefix + 'AvgCtrlCost', np.mean(ctrl_cost))
Example #10
    def log_diagnostics(self, paths, prefix=''):
        reach_dist = [path["env_infos"]['reachDist'] for path in paths]
        placing_dist = [path["env_infos"]['placeDist'] for path in paths]
        cos_dist = [path["env_infos"]['cosDist'] for path in paths]

        logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist))
        logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist))
        logger.logkv(prefix + 'AverageCosDistance', np.mean(cos_dist))
Example #11
    def _log_path_stats(self, paths, log=False, log_prefix=''):
        # compute log stats
        average_discounted_return = [
            sum(path["discounted_rewards"]) for path in paths
        ]
        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        if log == 'reward':
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))

        elif log == 'all' or log is True:
            logger.logkv(log_prefix + 'AverageDiscountedReturn',
                         np.mean(average_discounted_return))
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))
            logger.logkv(log_prefix + 'NumTrajs', len(paths))
            logger.logkv(log_prefix + 'StdReturn',
                         np.std(undiscounted_returns))
            logger.logkv(log_prefix + 'MaxReturn',
                         np.max(undiscounted_returns))
            logger.logkv(log_prefix + 'MinReturn',
                         np.min(undiscounted_returns))
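This variant reads path["discounted_rewards"], whose per-timestep values are presumably gamma**t * r_t so that their sum is the discounted return of the path (the other _log_path_stats variants below read the equivalent path["returns"][0]). A small sketch of that preprocessing, with an assumed discount factor:

import numpy as np

def discount_rewards(rewards, gamma=0.99):
    # discounted_rewards[t] = gamma**t * rewards[t], so their sum is the
    # discounted return of the path from t = 0
    return np.asarray(rewards) * gamma ** np.arange(len(rewards))

path = {"rewards": [1.0, 1.0, 1.0]}
path["discounted_rewards"] = discount_rewards(path["rewards"])
print(sum(path["discounted_rewards"]))  # 1 + 0.99 + 0.9801 = 2.9701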
Example #12
    def train(self):
        """
        Trains policy on env using algo

        Pseudocode:
            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [
                var for var in tf.global_variables()
                if not sess.run(tf.is_variable_initialized(var))
            ]
            sess.run(tf.variables_initializer(uninit_vars))

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                self.task = self.env.sample_tasks(self.sampler.meta_batch_size)
                self.sampler.set_tasks(self.task)
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)
                logger.log(
                    "Sampling set of tasks/goals for this meta-batch...")
                """ -------------------- Sampling --------------------------"""

                logger.log("Obtaining samples...")
                time_env_sampling_start = time.time()
                paths = self.sampler.obtain_samples(log=True,
                                                    log_prefix='train-')
                sampling_time = time.time() - time_env_sampling_start
                """ ----------------- Processing Samples ---------------------"""

                logger.log("Processing samples...")
                time_proc_samples_start = time.time()
                samples_data = self.sample_processor.process_samples(
                    paths, log='all', log_prefix='train-')
                proc_samples_time = time.time() - time_proc_samples_start

                self.log_diagnostics(sum(paths.values(), []), prefix='train-')
                """ ------------------ Policy Update ---------------------"""

                logger.log("Optimizing policy...")
                # This needs to take all samples_data so that it can construct graph for meta-optimization.
                time_optimization_step_start = time.time()
                self.algo.optimize_policy(samples_data)
                """ ------------------ Test-split Performance for logging ---------------------"""

                logger.log("Testing on test-tasks split for logging...")

                sampler_batch_size = self.sampler.batch_size
                self.sampler.update_batch_size(3)  # temporarily use a small batch size for the test split

                undiscounted_returns = []
                for i in range(0, self.env.NUM_EVAL,
                               self.sampler.meta_batch_size):
                    # Caution: when meta_batch_size >= NUM_EVAL (e.g. 100 on linux), i only takes the value 0

                    self.sampler.update_tasks(
                        test=True, start_from=i)  # sample from test split!
                    #self.policy.switch_to_pre_update()  # Switch to pre-update policy

                    logger.log("On Test: Obtaining samples...")
                    paths = self.sampler.obtain_samples(
                        log=False,
                        test=True)  # log_prefix='test-Step_%d-' % step

                    logger.log("On Test: Processing Samples...")
                    self.log_diagnostics(sum(list(paths.values()), []),
                                         prefix='test-')
                    """ ------------------- Logging Returns --------------------"""
                    paths = self.sample_processor.gao_paths(paths)
                    undiscounted_returns.extend(
                        [sum(path["rewards"]) for path in paths])

                test_average_return = np.mean(undiscounted_returns)
                self.sampler.update_batch_size(sampler_batch_size)
                """ ------------------- Logging Stuff --------------------------"""

                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps',
                             self.sampler.total_timesteps_sampled)

                logger.logkv('test-AverageReturn', test_average_return)

                logger.logkv('Time-Optimization',
                             time.time() - time_optimization_step_start)
                logger.logkv('Time-SampleProc', np.sum(proc_samples_time))
                logger.logkv('Time-Sampling', sampling_time)

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()
                if itr == 0:
                    sess.graph.finalize()

        logger.log("Training finished")
        self.sess.close()
Example #13
    def _log_path_stats(self,
                        paths,
                        log=False,
                        log_prefix='',
                        experiment=None):
        # compute log stats
        average_discounted_return = np.mean(
            [path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        # average_vel = np.mean([path["env_infos"]["forward_vel"] for path in paths])

        if log == 'reward':
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))

        elif log == 'all' or log is True:
            logger.logkv(log_prefix + 'AverageDiscountedReturn',
                         average_discounted_return)
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))
            logger.logkv(log_prefix + 'NumTrajs', len(paths))
            logger.logkv(log_prefix + 'StdReturn',
                         np.std(undiscounted_returns))
            logger.logkv(log_prefix + 'MaxReturn',
                         np.max(undiscounted_returns))
            logger.logkv(log_prefix + 'MinReturn',
                         np.min(undiscounted_returns))

            if experiment:
                # experiment.log_metric("Average velocity", average_vel)
                experiment.log_metric("maxReturn",
                                      np.max(undiscounted_returns))
                experiment.log_metric("MinReturn",
                                      np.min(undiscounted_returns))
                experiment.log_metric('AverageReturn',
                                      np.mean(undiscounted_returns))
                experiment.log_metric('StdReturn',
                                      np.std(undiscounted_returns))
                experiment.log_metric('AverageDiscountedReturn',
                                      average_discounted_return)
                experiment.log_metric('StdReturn',
                                      np.std(undiscounted_returns))
Example #14
    def train(self):
        """
        Trains policy on env using algo

        Pseudocode::
        
            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))]
            sess.run(tf.variables_initializer(uninit_vars))

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log("\n ---------------- Iteration %d ----------------" % itr)
                logger.log("Sampling set of tasks/goals for this meta-batch...")

                #self.sampler.update_tasks()
                self.policy.switch_to_pre_update()  # Switch to pre-update policy

                all_samples_data, all_paths = [], []
                list_sampling_time, list_inner_step_time, list_outer_step_time, list_proc_samples_time = [], [], [], []
                start_total_inner_time = time.time()
                for step in range(self.num_inner_grad_steps+1):
                    logger.log('** Step ' + str(step) + ' **')

                    """ -------------------- Sampling --------------------------"""

                    logger.log("Obtaining samples...")
                    time_env_sampling_start = time.time()
                    paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step)
                    list_sampling_time.append(time.time() - time_env_sampling_start)
                    all_paths.append(paths)

                    """ ----------------- Processing Samples ---------------------"""

                    logger.log("Processing samples...")
                    time_proc_samples_start = time.time()
                    samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='Step_%d-' % step)
                    all_samples_data.append(samples_data)
                    list_proc_samples_time.append(time.time() - time_proc_samples_start)

                    self.log_diagnostics(sum(list(paths.values()), []), prefix='Step_%d-' % step)

                    """ ------------------- Inner Policy Update --------------------"""

                    time_inner_step_start = time.time()
                    if step < self.num_inner_grad_steps:
                        logger.log("Computing inner policy updates...")
                        self.algo._adapt(samples_data)
                    # train_writer = tf.summary.FileWriter('/home/ignasi/Desktop/meta_policy_search_graph',
                    #                                      sess.graph)
                    list_inner_step_time.append(time.time() - time_inner_step_start)
                total_inner_time = time.time() - start_total_inner_time

                time_maml_opt_start = time.time()
                """ ------------------ Outer Policy Update ---------------------"""

                logger.log("Optimizing policy...")
                # This needs to take all samples_data so that it can construct graph for meta-optimization.
                time_outer_step_start = time.time()
                self.algo.optimize_policy(all_samples_data)

                """ ------------------- Logging Stuff --------------------------"""
                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)
                #writer.add_scalar(self.algo.name, self.sample_processor.AR, self.sampler.total_timesteps_sampled)
                logger.logkv('Time-OuterStep', time.time() - time_outer_step_start)
                logger.logkv('Time-TotalInner', total_inner_time)
                logger.logkv('Time-InnerStep', np.sum(list_inner_step_time))
                logger.logkv('Time-SampleProc', np.sum(list_proc_samples_time))
                logger.logkv('Time-Sampling', np.sum(list_sampling_time))

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)
                logger.logkv('Time-MAMLSteps', time.time() - time_maml_opt_start)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()

        logger.log("Training finished")
        self.sess.close()        
Example #15
    def train(self):

        policy_0 = self.policy

        for i in [4, 3, 2, 1]:  #range(1, self.eff+1):

            print("On", i, "self.policy == policy_0: ",
                  self.policy == policy_0)

            with self.sess.as_default() as sess:

                logger.log("----------- Adaptation rollouts per meta-task = ",
                           i, " -----------")

                undiscounted_returns = []
                for j in range(0, self.env.NUM_EVAL,
                               self.sampler.meta_batch_size):

                    logger.log("---------Testing on task", j, "~",
                               j + self.sampler.meta_batch_size - 1,
                               "---------")

                    # re-initialize all variables before each batch of test tasks
                    # (the "uninitialized vars only" variant is kept below for reference)
                    # uninit_vars = [var for var in tf.global_variables() if
                    #                not sess.run(tf.is_variable_initialized(var))]
                    # sess.run(tf.variables_initializer(uninit_vars))

                    uninit_vars = [var for var in tf.global_variables()]
                    sess.run(tf.variables_initializer(uninit_vars))

                    logger.log(
                        "Sampling set of tasks/goals for this meta-batch...")
                    self.sampler.update_tasks(
                        test=True, start_from=j)  # sample from test split!
                    self.policy.switch_to_pre_update(
                    )  # Switch to pre-update policy

                    for step in range(self.num_inner_grad_steps + 1):

                        if step < self.num_inner_grad_steps:
                            self.sampler.update_batch_size_v2(
                                i)  # use i adaptation rollouts per meta-task
                            logger.log("On step-0: Obtaining samples...")
                        else:
                            self.sampler.update_batch_size(2)
                            logger.log("On step-1: Obtaining samples...")

                        paths = self.sampler.obtain_samples(
                            log=False,
                            test=True)  # log_prefix='test-Step_%d-' % step

                        logger.log("On Test: Processing Samples...")
                        samples_data = self.sample_processor.process_samples(
                            paths, log=False
                        )  # log='all', log_prefix='test-Step_%d-' % step
                        self.log_diagnostics(sum(list(paths.values()), []),
                                             prefix='test-Step_%d-' % step)
                        """ ------------------- Inner Policy Update / logging returns --------------------"""
                        if step < self.num_inner_grad_steps:
                            logger.log(
                                "On Test: Computing inner policy updates...")
                            self.algo._adapt(samples_data)
                        else:
                            paths = self.sample_processor.gao_paths(paths)
                            undiscounted_returns.extend(
                                [sum(path["rewards"]) for path in paths])

                test_average_return = np.mean(undiscounted_returns)
                logger.logkv('x', i)
                logger.logkv('return', test_average_return)
                logger.dumpkvs()

            logger.log("------Testing rollouts per meta-task = ", i,
                       "finished------")
Example #16
    def obtain_samples(self, log=False, log_prefix='', test=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger

        Returns:
            (dict) : A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
        """
        print(
            "--------------obtaining",
            self.total_samples // self.meta_batch_size // self.max_path_length,
            "rollouts_per_task, for", self.meta_batch_size,
            "tasks..--------------")

        # initial setup / preparation
        paths = OrderedDict()
        for i in range(self.meta_batch_size):
            paths[i] = []

        n_samples = 0
        running_paths = [
            _get_empty_running_paths_dict()
            for _ in range(self.vec_env.num_envs)
        ]
        print("                runnng_paths length:", len(running_paths))

        pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy
        policy.reset(dones=[True] * self.meta_batch_size)

        # initial reset of envs
        obses = self.vec_env.reset()

        while n_samples < self.total_samples:

            # execute policy
            t = time.time()
            obs_per_task = np.split(np.asarray(obses), self.meta_batch_size)
            actions, agent_infos = policy.get_actions(obs_per_task)
            policy_time += time.time() - t

            # step environments
            t = time.time()
            actions = np.concatenate(actions)  # stack meta batch
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            #  stack agent_infos and if no infos were provided (--> None) create empty dicts
            agent_infos, env_infos = self._handle_info_dicts(
                agent_infos, env_infos)

            new_samples = 0
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                # append new samples to running paths
                if isinstance(reward, np.ndarray):
                    reward = reward[0]
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["dones"].append(done)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)

                # if running path is done, add it to paths and empty the running path
                if done:
                    paths[idx // self.envs_per_task].append(
                        dict(
                            observations=np.asarray(
                                running_paths[idx]["observations"]),
                            actions=np.asarray(running_paths[idx]["actions"]),
                            rewards=np.asarray(running_paths[idx]["rewards"]),
                            dones=np.asarray(running_paths[idx]["dones"]),
                            env_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    new_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = _get_empty_running_paths_dict()

            pbar.update(new_samples)
            n_samples += new_samples
            obses = next_obses
        pbar.stop()

        if not test:
            self.total_timesteps_sampled += self.total_samples
            print("------------self.total_timesteps_sampled:",
                  self.total_timesteps_sampled, "-----------------")
        else:
            print("------------tested on:",
                  self.total_samples // self.max_path_length,
                  " rollouts-----------------")

        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
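The helper _get_empty_running_paths_dict referenced above is not included here; given the fields the sampling loop appends to, it presumably returns a fresh dict of empty lists, roughly:

def _get_empty_running_paths_dict():
    # one empty list per field appended to in the sampling loop above
    return dict(observations=[], actions=[], rewards=[],
                dones=[], env_infos=[], agent_infos=[])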
Example #17
    def train(self):
        """
        Trains policy on env using algo

        Pseudocode::
        
            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [
                var for var in tf.global_variables()
                if not sess.run(tf.is_variable_initialized(var))
            ]
            sess.run(tf.variables_initializer(uninit_vars))

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)
                logger.log(
                    "Sampling set of tasks/goals for this meta-batch...")

                self.sampler.update_tasks()  # sample tasks!
                self.policy.switch_to_pre_update(
                )  # Switch to pre-update policy

                all_samples_data, all_paths = [], []
                list_sampling_time, list_inner_step_time, list_outer_step_time, list_proc_samples_time = [], [], [], []
                start_total_inner_time = time.time()
                for step in range(self.num_inner_grad_steps + 1):

                    logger.log('** Step ' + str(step) + ' **')
                    """ -------------------- Sampling --------------------------"""

                    logger.log("Obtaining samples...")
                    time_env_sampling_start = time.time()
                    '''
                    if step == self.num_inner_grad_steps:
                        temp = self.sampler.batch_size
                        self.sampler.update_batch_size(2)
                        paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step)
                        self.sampler.update_batch_size(temp)
                    else:
                        paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step)
                    '''
                    paths = self.sampler.obtain_samples(log=True,
                                                        log_prefix='Step_%d-' %
                                                        step)

                    list_sampling_time.append(time.time() -
                                              time_env_sampling_start)
                    all_paths.append(paths)
                    """ ----------------- Processing Samples ---------------------"""

                    logger.log("Processing samples...")
                    time_proc_samples_start = time.time()
                    samples_data = self.sample_processor.process_samples(
                        paths, log='all', log_prefix='Step_%d-' % step)
                    all_samples_data.append(samples_data)
                    list_proc_samples_time.append(time.time() -
                                                  time_proc_samples_start)

                    self.log_diagnostics(sum(list(paths.values()), []),
                                         prefix='Step_%d-' % step)
                    """ ------------------- Inner Policy Update --------------------"""

                    time_inner_step_start = time.time()
                    if step < self.num_inner_grad_steps:
                        logger.log("Computing inner policy updates...")
                        self.algo._adapt(samples_data)
                    # train_writer = tf.summary.FileWriter('/home/ignasi/Desktop/meta_policy_search_graph',
                    #                                      sess.graph)
                    list_inner_step_time.append(time.time() -
                                                time_inner_step_start)
                total_inner_time = time.time() - start_total_inner_time

                time_maml_opt_start = time.time()
                """ ------------------ Outer Policy Update ---------------------"""

                logger.log("Optimizing policy...")
                # This needs to take all samples_data so that it can construct graph for meta-optimization.
                time_outer_step_start = time.time()
                self.algo.optimize_policy(all_samples_data)
                """ ------------------ Test-split Performance for logging ---------------------"""

                logger.log(
                    "Testing on test-tasks split for logging, rollout_per_task = 20..."
                )
                undiscounted_returns = []

                for i in range(0, self.env.NUM_EVAL,
                               self.sampler.meta_batch_size):
                    self.sampler.update_tasks(
                        test=True, start_from=i)  # sample from test split!
                    self.policy.switch_to_pre_update(
                    )  # Switch to pre-update policy

                    for step in range(self.num_inner_grad_steps + 1):
                        logger.log("On Test: Obtaining samples...")
                        paths = self.sampler.obtain_samples(
                            log=False,
                            test=True)  # log_prefix='test-Step_%d-' % step

                        logger.log("On Test: Processing Samples...")
                        samples_data = self.sample_processor.process_samples(
                            paths, log=False
                        )  # log='all', log_prefix='test-Step_%d-' % step
                        self.log_diagnostics(sum(list(paths.values()), []),
                                             prefix='test20-Step_%d-' % step)
                        """ ------------------- Inner Policy Update / logging returns --------------------"""
                        if step < self.num_inner_grad_steps:
                            logger.log(
                                "On Test: Computing inner policy updates...")
                            self.algo._adapt(samples_data)
                        else:
                            paths = self.sample_processor.gao_paths(paths)
                            undiscounted_returns.extend(
                                [sum(path["rewards"]) for path in paths])

                test_average_return = np.mean(undiscounted_returns)
                logger.logkv('test20-AverageReturn', test_average_return)

                logger.log(
                    "Testing on test-tasks split for logging, rollout_per_task = 2..."
                )
                sampler_batch_size = self.sampler.batch_size
                self.sampler.update_batch_size(2)  # use 2 rollouts per task for this test pass
                undiscounted_returns = []

                for i in range(0, self.env.NUM_EVAL,
                               self.sampler.meta_batch_size):
                    self.sampler.update_tasks(
                        test=True, start_from=i)  # sample from test split!
                    self.policy.switch_to_pre_update(
                    )  # Switch to pre-update policy

                    for step in range(self.num_inner_grad_steps + 1):
                        logger.log("On Test: Obtaining samples...")
                        paths = self.sampler.obtain_samples(
                            log=False,
                            test=True)  # log_prefix='test-Step_%d-' % step

                        logger.log("On Test: Processing Samples...")
                        samples_data = self.sample_processor.process_samples(
                            paths, log=False
                        )  # log='all', log_prefix='test-Step_%d-' % step
                        self.log_diagnostics(sum(list(paths.values()), []),
                                             prefix='test-Step_%d-' % step)
                        """ ------------------- Inner Policy Update / logging returns --------------------"""
                        if step < self.num_inner_grad_steps:
                            logger.log(
                                "On Test: Computing inner policy updates...")
                            self.algo._adapt(samples_data)
                        else:
                            paths = self.sample_processor.gao_paths(paths)
                            undiscounted_returns.extend(
                                [sum(path["rewards"]) for path in paths])

                test_average_return = np.mean(undiscounted_returns)
                self.sampler.update_batch_size(sampler_batch_size)
                """ ------------------- Logging Stuff --------------------------"""
                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps',
                             self.sampler.total_timesteps_sampled)

                logger.logkv('test-AverageReturn', test_average_return)

                logger.logkv('Time-OuterStep',
                             time.time() - time_outer_step_start)
                logger.logkv('Time-TotalInner', total_inner_time)
                logger.logkv('Time-InnerStep', np.sum(list_inner_step_time))
                logger.logkv('Time-SampleProc', np.sum(list_proc_samples_time))
                logger.logkv('Time-Sampling', np.sum(list_sampling_time))

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)
                logger.logkv('Time-MAMLSteps',
                             time.time() - time_maml_opt_start)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()

        logger.log("Training finished")
        self.sess.close()
Example #18
    def _log_path_stats(self, paths, log=False, log_prefix=''):
        # compute log stats
        average_discounted_return = np.mean(
            [path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        if log == 'reward':
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))
            if 'Test' in log_prefix:
                logger.logkv('AverageReturn_all_test_tasks_last',
                             np.mean(undiscounted_returns))

        elif log == 'all' or log is True:
            logger.logkv(log_prefix + 'AverageDiscountedReturn',
                         average_discounted_return)
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))
            logger.logkv(log_prefix + 'NumTrajs', len(paths))
            logger.logkv(log_prefix + 'StdReturn',
                         np.std(undiscounted_returns))
            logger.logkv(log_prefix + 'MaxReturn',
                         np.max(undiscounted_returns))
            logger.logkv(log_prefix + 'MinReturn',
                         np.min(undiscounted_returns))
Example #19
    def _log_path_stats(self,
                        paths,
                        log=False,
                        log_prefix='',
                        meta_batch_size=0):
        # compute log stats
        average_discounted_return = np.mean(
            [path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        if log == 'reward':
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))

        elif log == 'all' or log is True:
            logger.logkv(log_prefix + 'AverageDiscountedReturn',
                         average_discounted_return)
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))
            '''
            logger.logkv(log_prefix + 'AverageReturn-2', np.mean(undiscounted_returns[-2*meta_batch_size:]))
            # will take undiscounted[-meta_batch_size:] when rollouts_per_meta_task < 2
            logger.log("AverageReturn-2 is estimated by the last 2 trajectories...")
            '''

            logger.logkv(log_prefix + 'NumTrajs', len(paths))
            logger.logkv(log_prefix + 'StdReturn',
                         np.std(undiscounted_returns))
            logger.logkv(log_prefix + 'MaxReturn',
                         np.max(undiscounted_returns))
            logger.logkv(log_prefix + 'MinReturn',
                         np.min(undiscounted_returns))
Example #20
    def train(self):
        """
        Trains policy on env using algo

        Pseudocode::
        
            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [
                var for var in tf.global_variables()
                if not sess.run(tf.is_variable_initialized(var))
            ]
            sess.run(tf.variables_initializer(uninit_vars))
            n_timesteps = 0

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)

                gradients = []
                for i in range(self.num_sapling_rounds):
                    logger.log("\n ----- Sampling Round %d ---" % i)

                    dry = i < self.num_sapling_rounds - 1

                    if not dry: self.sampler.update_tasks()
                    self.policy.switch_to_pre_update(
                    )  # Switch to pre-update policy

                    all_samples_data, all_paths = [], []

                    for step in range(self.num_inner_grad_steps + 1):
                        logger.log('** Step ' + str(step) + ' **')

                        logger.log("Obtaining samples...")
                        paths = self.sampler.obtain_samples(
                            log=True, log_prefix='Step_%d-' % step)
                        all_paths.append(paths)

                        logger.log("Processing samples...")
                        samples_data = self.sample_processor.process_samples(
                            paths, log='all', log_prefix='Step_%d-' % step)
                        all_samples_data.append(samples_data)

                        if not dry:
                            self.log_diagnostics(sum(list(paths.values()), []),
                                                 prefix='Step_%d-' % step)

                        if step < self.num_inner_grad_steps:
                            logger.log("Computing inner policy updates...")
                            self.algo._adapt(samples_data)
                    """ compute gradients """
                    gradients.append(
                        self.algo.compute_gradients(all_samples_data))

                    if not dry:
                        """ ------------ Compute and log gradient variance ------------"""
                        # compute variance of adaptation gradients
                        for step_id in range(self.num_inner_grad_steps):
                            meta_batch_size = len(gradients[0][0])
                            grad_std, grad_rstd = [], []
                            for task_id in range(meta_batch_size):
                                stacked_grads = np.stack([
                                    gradients[round_id][step_id][task_id] for
                                    round_id in range(self.num_sapling_rounds)
                                ],
                                                         axis=1)
                                std = np.std(stacked_grads, axis=1)
                                mean = np.abs(np.mean(stacked_grads, axis=1))
                                grad_std.append(np.mean(std))
                                grad_rstd.append(np.mean(std / mean))

                            logger.logkv('Step_%i-GradientMean' % step_id, np.mean(mean))
                            logger.logkv('Step_%i-GradientStd' % step_id,
                                         np.mean(grad_std))
                            logger.logkv('Step_%i-GradientRStd' % step_id,
                                         np.mean(grad_rstd))

                        # compute variance of meta gradients
                        stacked_grads = np.stack([
                            gradients[round_id][self.num_inner_grad_steps]
                            for round_id in range(self.num_sapling_rounds)
                        ],
                                                 axis=1)
                        std = np.std(stacked_grads, axis=1)
                        mean = np.abs(np.mean(stacked_grads, axis=1))

                        meta_grad_std = np.mean(std)
                        meta_grad_rstd = np.mean(std / (mean + 1e-8))
                        meta_grad_rvar = np.mean(std**2 / (mean + 1e-8))

                        logger.logkv('Meta-GradientMean', np.mean(mean))
                        logger.logkv('Meta-GradientStd', meta_grad_std)
                        logger.logkv('Meta-GradientRStd', meta_grad_rstd)
                        logger.logkv('Meta-GradientRVariance', meta_grad_rvar)

                        # compute cosine dists
                        cosine_dists = cdist(np.transpose(stacked_grads),
                                             np.transpose(
                                                 np.mean(stacked_grads,
                                                         axis=1).reshape(
                                                             (-1, 1))),
                                             metric='cosine')
                        mean_abs_cos_dist = np.mean(np.abs(cosine_dists))
                        mean_squared_cosine_dists = np.mean(cosine_dists**2)
                        mean_squared_cosine_dists_sqrt = np.sqrt(
                            mean_squared_cosine_dists)

                        logger.logkv('Meta-GradientCosAbs', mean_abs_cos_dist)
                        logger.logkv('Meta-GradientCosVar',
                                     mean_squared_cosine_dists)
                        logger.logkv('Meta-GradientCosStd',
                                     mean_squared_cosine_dists_sqrt)
                        """ ------------------ Outer Policy Update ---------------------"""

                        logger.log("Optimizing policy...")
                        # This needs to take all samples_data so that it can construct graph for meta-optimization.
                        self.algo.optimize_policy(all_samples_data)
                        """ ------------------- Logging Stuff --------------------------"""
                        n_timesteps += (self.num_inner_grad_steps +
                                        1) * self.sampler.total_samples
                        logger.logkv('n_timesteps', n_timesteps)

                        logger.log("Saving snapshot...")
                        params = self.get_itr_snapshot(itr)  # , **kwargs)
                        logger.save_itr_params(itr, params)
                        logger.log("Saved")

                        logger.logkv('Itr', itr)
                        logger.logkv('Time', time.time() - start_time)
                        logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.dumpkvs()

        logger.log("Training finished")
        self.sess.close()
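The gradient-variance block above follows a stack-then-reduce pattern over sampling rounds. A self-contained numpy sketch of the same statistics on made-up flattened meta-gradients (shapes and values are purely illustrative):

import numpy as np
from scipy.spatial.distance import cdist

num_rounds, grad_dim = 4, 6
rng = np.random.default_rng(0)
round_grads = [rng.normal(size=grad_dim) for _ in range(num_rounds)]  # fake per-round meta-gradients

stacked = np.stack(round_grads, axis=1)        # shape (grad_dim, num_rounds)
std = np.std(stacked, axis=1)
mean = np.abs(np.mean(stacked, axis=1))

print('Meta-GradientStd ', np.mean(std))
print('Meta-GradientRStd', np.mean(std / (mean + 1e-8)))

# cosine distance of each round's gradient to the mean gradient across rounds
cos_dists = cdist(stacked.T, np.mean(stacked, axis=1).reshape(1, -1), metric='cosine')
print('Meta-GradientCosAbs', np.mean(np.abs(cos_dists)))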
Example #21
File: base.py Project: naeioi/ProMP
    def _log_path_stats(self, paths, log=False, log_prefix=''):
        # compute log stats
        average_discounted_return = np.mean(
            [path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        if log == 'reward':
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))

        elif log == 'all' or log is True:
            logger.logkv(log_prefix + 'AverageDiscountedReturn',
                         average_discounted_return)
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))
            logger.logkv(log_prefix + 'NumTrajs', len(paths))
            logger.logkv(log_prefix + 'StdReturn',
                         np.std(undiscounted_returns))
            logger.logkv(log_prefix + 'MaxReturn',
                         np.max(undiscounted_returns))
            logger.logkv(log_prefix + 'MinReturn',
                         np.min(undiscounted_returns))
            if 'success' in paths[0]['env_infos']:
                successes = [
                    path['env_infos']['success'].any() for path in paths
                ]
                logger.logkv(log_prefix + 'SuccessRate', np.mean(successes))
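The SuccessRate key counts a trajectory as successful if any timestep reported success. With env_infos stacked per path (as the sampler in Example #16 produces), that reduces to:

import numpy as np

paths = [
    {'env_infos': {'success': np.array([0, 0, 1])}},  # succeeded at some step
    {'env_infos': {'success': np.array([0, 0, 0])}},  # never succeeded
]
successes = [path['env_infos']['success'].any() for path in paths]
print('SuccessRate', np.mean(successes))  # 0.5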