コード例 #1
0
    def optimize_policy(self, all_samples_data, log=True):
        """
        Runs the MAML outer optimization step and records before/after stats.

        Args:
            all_samples_data (list) : samples split by gradient update and meta
                task (list of lists of lists of sample dicts)
            log (bool) : whether to record KL/loss statistics with the logger

        Returns:
            None
        """
        input_dict = self._extract_input_dict_meta_op(all_samples_data,
                                                      self._optimization_keys)

        # Measure the constraint (KL) and loss before the update, run the
        # optimizer, then measure both again.
        logger.log("Computing KL before")
        kl_pre = self.optimizer.constraint_val(input_dict)

        logger.log("Computing loss before")
        loss_pre = self.optimizer.loss(input_dict)

        logger.log("Optimizing")
        self.optimizer.optimize(input_dict)

        logger.log("Computing loss after")
        loss_post = self.optimizer.loss(input_dict)

        logger.log("Computing KL after")
        kl_post = self.optimizer.constraint_val(input_dict)

        if log:
            logger.logkv('MeanKLBefore', kl_pre)
            logger.logkv('MeanKL', kl_post)

            logger.logkv('LossBefore', loss_pre)
            logger.logkv('LossAfter', loss_post)
            logger.logkv('dLoss', loss_pre - loss_post)
コード例 #2
0
    def optimize(self, input_val_dict):
        """
        Carries out the optimization step.

        Args:
            input_val_dict (dict): dict containing the values to be fed into
                the computation graph

        Returns:
            (float) loss before optimization (i.e. the loss of the first epoch)
        """
        sess = tf.get_default_session()
        feed_dict = self.create_feed_dict(input_val_dict)

        # TODO: reimplement minibatches (previously handled by a
        # MAMLBatchDataset built from self._batch_size / meta_batch_size).

        loss_before_opt = None
        for epoch in range(self._max_epochs):
            if self._verbose:
                logger.log("Epoch %d" % epoch)

            loss, _ = sess.run([self._loss, self._train_op], feed_dict)
            # Explicit None check: `if not loss_before_opt` would treat a
            # legitimate first-epoch loss of 0.0 as unset and silently report
            # a later epoch's loss instead.
            if loss_before_opt is None:
                loss_before_opt = loss

        return loss_before_opt
コード例 #3
0
    def optimize(self, input_val_dict):
        """
        Carries out the optimization step.

        Args:
            input_val_dict (dict): dict containing the values to be fed into
                the computation graph

        Returns:
            (float) loss before optimization (i.e. the loss of the first epoch)
        """
        sess = tf.get_default_session()
        feed_dict = self.create_feed_dict(input_val_dict)

        loss_before_opt = None
        for epoch in range(self._max_epochs):
            if self._verbose:
                logger.log("Epoch %d" % epoch)

            loss, _ = sess.run([self._loss, self._train_op], feed_dict)
            # Explicit None check: `if not loss_before_opt` would treat a
            # legitimate first-epoch loss of 0.0 as unset and silently report
            # a later epoch's loss instead.
            if loss_before_opt is None:
                loss_before_opt = loss

        return loss_before_opt
コード例 #4
0
    def optimize_policy(self, samples_data, log=True):
        """
        Performs the policy optimization step.

        Args:
            samples_data : processed samples for the current batch, as accepted
                by ``self._extract_input_dict`` (NOTE: the previous docstring
                documented a nonexistent ``all_samples_data`` parameter)
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict(samples_data,
                                              self._optimization_keys,
                                              prefix='train')

        if log: logger.log("Optimizing")
        # optimize() returns the loss prior to the update.
        loss_before = self.optimizer.optimize(input_val_dict=input_dict)

        if log: logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=input_dict)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
コード例 #5
0
 def compute_gradients(self, all_samples_data, log=True):
     """
     Evaluates the meta-optimization gradients for the given samples.

     Args:
         all_samples_data (list) : samples split by gradient update and meta
             task (list of lists of lists of sample dicts)
         log (bool) : whether to log progress

     Returns:
         the gradient values fetched from the default TF session
     """
     input_dict = self._extract_input_dict_meta_op(all_samples_data,
                                                   self._optimization_keys)
     feed = utils.create_feed_dict(placeholder_dict=self.meta_op_phs_dict,
                                   value_dict=input_dict)
     if log:
         logger.log("compute gradients")
     return tf.get_default_session().run(self.gradients, feed_dict=feed)
コード例 #6
0
    def train(self):
        """
        Evaluation loop: for each adaptation batch size i in 1..self.eff,
        re-initializes uninitialized TF variables, samples evaluation tasks,
        collects and processes rollouts, and dumps logged statistics.

        Note: the policy-update section is commented out, so this routine only
        samples and logs — it does not optimize the policy.
        """

        for i in range(1, self.eff+1):

            with self.sess.as_default() as sess:

                logger.log("----------- Adaptation rollouts per meta-task = ", i, " -----------")
                # self.sampler.rollouts_per_meta_task = 10000
                # Use i rollouts per meta-task for this pass.
                self.sampler.update_batch_size(i)

                # initialize uninitialized vars  (only initialize vars that were not loaded)
                uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))]
                sess.run(tf.variables_initializer(uninit_vars))

                # Sample a fresh batch of evaluation tasks and hand them to the sampler.
                self.task = self.env.sample_tasks(self.sampler.meta_batch_size, is_eval=True)
                self.sampler.set_tasks(self.task)

                #logger.log("\n ---------------- Iteration %d ----------------" % itr)
                logger.log("Sampling set of tasks/goals for this meta-batch...")

                """ -------------------- Sampling --------------------------"""

                logger.log("Obtaining samples...")
                paths = self.sampler.obtain_samples(log=True, log_prefix='train-')

                """ ----------------- Processing Samples ---------------------"""

                logger.log("Processing samples...")
                samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='train-')
                self.log_diagnostics(sum(paths.values(), []), prefix='train-')

                #""" ------------------ Policy Update ---------------------"""

                #logger.log("Optimizing policy...")
                ## This needs to take all samples_data so that it can construct graph for meta-optimization.
                #time_optimization_step_start = time.time()
                #self.algo.optimize_policy(samples_data)

                """ ------------------- Logging Stuff --------------------------"""
                logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)

                #logger.log("Saving snapshot...")
                #params = self.get_itr_snapshot(itr)
                #logger.save_itr_params(itr, params)
                #logger.log("Saved")

                logger.dumpkvs()
                # if itr == 0:
                    # sess.graph.finalize()

            # NOTE(review): this fires once per value of i (inside the loop),
            # unlike sibling trainers where it runs once after training —
            # confirm this placement is intended.
            logger.log("Training finished")
        self.sess.close()
コード例 #7
0
    def optimize(self, input_val_dict):
        """
        Carries out the optimization step for a recurrent target using
        truncated backpropagation through time.

        Args:
            input_val_dict (dict): dict containing the values to be fed into
                the computation graph; values are assumed to be arrays of
                shape (batch, seq_len, ...) — the first entry's shape is used
                to determine both dimensions (TODO confirm for all inputs)

        Returns:
            (float) mean loss of the first epoch (loss before optimization)
        """
        sess = tf.get_default_session()
        batch_size, seq_len, *_ = list(input_val_dict.values())[0].shape

        loss_before_opt = None
        for epoch in range(self._max_epochs):
            hidden_batch = self._target.get_zero_state(batch_size)
            if self._verbose:
                logger.log("Epoch %d" % epoch)

            epoch_losses = []
            all_grads = []

            # Walk the sequence in windows of self._backprop_steps, carrying
            # the hidden state across windows while truncating gradients at
            # each window boundary.
            for i in range(0, seq_len, self._backprop_steps):
                n_i = i + self._backprop_steps
                feed_dict = {self._input_ph_dict[key]:
                             input_val_dict[key][:, i:n_i]
                             for key in self._input_ph_dict.keys()}
                feed_dict[self._hidden_ph] = hidden_batch
                batch_loss, grads, hidden_batch = sess.run(
                    [self._loss, self._gradients_var, self._next_hidden_var],
                    feed_dict=feed_dict)
                epoch_losses.append(batch_loss)
                all_grads.append(grads)

            # Average each gradient tensor over the windows, then apply a
            # single parameter update for the epoch.
            grads = [np.mean(grad, axis=0) for grad in zip(*all_grads)]
            feed_dict = dict(zip(self._gradients_ph, grads))
            sess.run(self._train_op, feed_dict=feed_dict)

            # Explicit None check: `if not loss_before_opt` would treat a
            # legitimate first-epoch mean loss of 0.0 as unset and silently
            # report a later epoch's loss instead.
            if loss_before_opt is None:
                loss_before_opt = np.mean(epoch_losses)

        return loss_before_opt
コード例 #8
0
ファイル: pro_mp.py プロジェクト: ZJUGuoShuai/NG-MAML
    def optimize_policy(self, all_samples_data, log=True):
        """
        Performs the ProMP outer optimization step.

        Args:
            all_samples_data (list) : samples split by gradient update and meta
                task (list of lists of lists of sample dicts)
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict_meta_op(all_samples_data,
                                                      self._optimization_keys)

        # Feed the current inner-KL penalty coefficients and the PPO clipping
        # range alongside the sample data.
        input_dict['inner_kl_coeff'] = self.inner_kl_coeff
        input_dict['clip_eps'] = self.clip_eps

        if log:
            logger.log("Optimizing")
        loss_before = self.optimizer.optimize(input_val_dict=input_dict)

        if log:
            logger.log("Computing statistics")
        loss_after, inner_kls, outer_kl = self.optimizer.compute_stats(
            input_val_dict=input_dict)

        # Optionally adapt the inner KL penalty towards the target step size.
        if self.adaptive_inner_kl_penalty:
            if log:
                logger.log("Updating inner KL loss coefficients")
            self.inner_kl_coeff = self.adapt_kl_coeff(
                self.inner_kl_coeff, inner_kls, self.target_inner_step)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
            logger.logkv('KLInner', np.mean(inner_kls))
            logger.logkv('KLCoeffInner', np.mean(self.inner_kl_coeff))
コード例 #9
0
ファイル: mumo_pro_mp.py プロジェクト: Zber5/MMAML-rl
    def optimize_policy(self,
                        all_samples_data,
                        mod_samples_data,
                        num_paths_per_rollout,
                        log=True):
        """
        Performs the MAML outer optimization step with modulation inputs.

        Args:
            all_samples_data (list) : samples split by gradient update and meta
                task (list of lists of lists of sample dicts)
            mod_samples_data : value fed to the policy's modulation input
                placeholder
            num_paths_per_rollout : value fed to the policy's path-count
                placeholder
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict_meta_op(all_samples_data,
                                                      self._optimization_keys)

        # Placeholders fed directly rather than through the meta-op input dict.
        extra_feed = {
            self.policy.mod_input_var: mod_samples_data,
            self.policy.num_paths_var: num_paths_per_rollout,
        }

        # Feed the current inner-KL penalty coefficients and clipping range
        # alongside the sample data.
        input_dict['inner_kl_coeff'] = self.inner_kl_coeff
        input_dict['clip_eps'] = self.clip_eps

        if log:
            logger.log("Optimizing")

        loss_before, grad_norms = self.optimizer.optimize(
            input_val_dict=input_dict, extra_feed_dict=extra_feed)

        # Record per-variable gradient norms to TensorBoard when configured.
        if self.summary_writer is not None:
            for name, norm in grad_norms.items():
                tensorboard_util.log_scalar(self.summary_writer,
                                            'grads/' + name, norm,
                                            self.log_step)
            self.log_step += 1

        if log:
            logger.log("Computing statistics")
        loss_after, inner_kls, outer_kl = self.optimizer.compute_stats(
            input_val_dict=input_dict, extra_feed_dict=extra_feed)

        # Optionally adapt the inner KL penalty towards the target step size.
        if self.adaptive_inner_kl_penalty:
            if log:
                logger.log("Updating inner KL loss coefficients")
            self.inner_kl_coeff = self.adapt_kl_coeff(
                self.inner_kl_coeff, inner_kls, self.target_inner_step)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
            logger.logkv('KLInner', np.mean(inner_kls))
            logger.logkv('KLCoeffInner', np.mean(self.inner_kl_coeff))
コード例 #10
0
ファイル: meta_trainer.py プロジェクト: MinorJerry/NG_MAML
    def train(self):
        """
        Trains policy on env using algo.

        Pseudocode::

            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))]
            sess.run(tf.variables_initializer(uninit_vars))

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log("\n ---------------- Iteration %d ----------------" % itr)
                logger.log("Sampling set of tasks/goals for this meta-batch...")

                # NOTE(review): task resampling is commented out, so each
                # iteration reuses the sampler's current tasks — confirm intended.
                #self.sampler.update_tasks()
                self.policy.switch_to_pre_update()  # Switch to pre-update policy

                all_samples_data, all_paths = [], []
                list_sampling_time, list_inner_step_time, list_outer_step_time, list_proc_samples_time = [], [], [], []
                start_total_inner_time = time.time()
                # One pass per inner gradient step, plus a final pass that only
                # collects post-adaptation samples (no _adapt on the last step).
                for step in range(self.num_inner_grad_steps+1):
                    logger.log('** Step ' + str(step) + ' **')

                    """ -------------------- Sampling --------------------------"""

                    logger.log("Obtaining samples...")
                    time_env_sampling_start = time.time()
                    paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step)
                    list_sampling_time.append(time.time() - time_env_sampling_start)
                    all_paths.append(paths)

                    """ ----------------- Processing Samples ---------------------"""

                    logger.log("Processing samples...")
                    time_proc_samples_start = time.time()
                    samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='Step_%d-' % step)
                    all_samples_data.append(samples_data)
                    list_proc_samples_time.append(time.time() - time_proc_samples_start)

                    self.log_diagnostics(sum(list(paths.values()), []), prefix='Step_%d-' % step)

                    """ ------------------- Inner Policy Update --------------------"""

                    time_inner_step_start = time.time()
                    if step < self.num_inner_grad_steps:
                        logger.log("Computing inner policy updates...")
                        self.algo._adapt(samples_data)
                    # train_writer = tf.summary.FileWriter('/home/ignasi/Desktop/meta_policy_search_graph',
                    #                                      sess.graph)
                    list_inner_step_time.append(time.time() - time_inner_step_start)
                total_inner_time = time.time() - start_total_inner_time

                time_maml_opt_start = time.time()
                """ ------------------ Outer Policy Update ---------------------"""

                logger.log("Optimizing policy...")
                # This needs to take all samples_data so that it can construct graph for meta-optimization.
                time_outer_step_start = time.time()
                self.algo.optimize_policy(all_samples_data)

                """ ------------------- Logging Stuff --------------------------"""
                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)
                #writer.add_scalar(self.algo.name, self.sample_processor.AR, self.sampler.total_timesteps_sampled)
                logger.logkv('Time-OuterStep', time.time() - time_outer_step_start)
                logger.logkv('Time-TotalInner', total_inner_time)
                logger.logkv('Time-InnerStep', np.sum(list_inner_step_time))
                logger.logkv('Time-SampleProc', np.sum(list_proc_samples_time))
                logger.logkv('Time-Sampling', np.sum(list_sampling_time))

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)
                logger.logkv('Time-MAMLSteps', time.time() - time_maml_opt_start)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()

        logger.log("Training finished")
        self.sess.close()
コード例 #11
0
    def train(self):
        """
        Evaluation loop over adaptation batch sizes (4, 3, 2, 1): for each
        size, iterates the test-task split in meta-batches, re-initializes ALL
        global TF variables, performs inner-policy adaptation, and logs the
        average undiscounted return of the post-adaptation rollouts.
        """

        # Kept to verify the policy object identity is stable across passes.
        policy_0 = self.policy

        for i in [4, 3, 2, 1]:  #range(1, self.eff+1):

            print("On", i, "self.policy == policy_0: ",
                  self.policy == policy_0)

            with self.sess.as_default() as sess:

                logger.log("----------- Adaptation rollouts per meta-task = ",
                           i, " -----------")

                undiscounted_returns = []
                for j in range(0, self.env.NUM_EVAL,
                               self.sampler.meta_batch_size):

                    logger.log("---------Testing on task", j, "~",
                               j + self.sampler.meta_batch_size - 1,
                               "---------")

                    # initialize uninitialized vars  (only initialize vars that were not loaded)
                    # uninit_vars = [var for var in tf.global_variables() if
                    #                not sess.run(tf.is_variable_initialized(var))]
                    # sess.run(tf.variables_initializer(uninit_vars))

                    # NOTE(review): unlike the commented version above, this
                    # re-initializes EVERY global variable for each task batch
                    # — presumably to reset the policy to its loaded state;
                    # confirm this does not clobber restored checkpoints.
                    uninit_vars = [var for var in tf.global_variables()]
                    sess.run(tf.variables_initializer(uninit_vars))

                    logger.log(
                        "Sampling set of tasks/goals for this meta-batch...")
                    self.sampler.update_tasks(
                        test=True, start_from=j)  # sample from test split!
                    self.policy.switch_to_pre_update(
                    )  # Switch to pre-update policy

                    for step in range(self.num_inner_grad_steps + 1):

                        # Adaptation steps sample i rollouts; the final
                        # evaluation step always samples 2.
                        if step < self.num_inner_grad_steps:
                            self.sampler.update_batch_size_v2(
                                i)  ######################
                            logger.log("On step-0: Obtaining samples...")
                        else:
                            self.sampler.update_batch_size(2)
                            logger.log("On step-1: Obtaining samples...")

                        paths = self.sampler.obtain_samples(
                            log=False,
                            test=True)  # log_prefix='test-Step_%d-' % step

                        logger.log("On Test: Processing Samples...")
                        samples_data = self.sample_processor.process_samples(
                            paths, log=False
                        )  # log='all', log_prefix='test-Step_%d-' % step
                        self.log_diagnostics(sum(list(paths.values()), []),
                                             prefix='test-Step_%d-' % step)
                        """ ------------------- Inner Policy Update / logging returns --------------------"""
                        if step < self.num_inner_grad_steps:
                            logger.log(
                                "On Test: Computing inner policy updates...")
                            self.algo._adapt(samples_data)
                        else:
                            paths = self.sample_processor.gao_paths(paths)
                            undiscounted_returns.extend(
                                [sum(path["rewards"]) for path in paths])

                test_average_return = np.mean(undiscounted_returns)
                logger.logkv('x', i)
                logger.logkv('return', test_average_return)
                logger.dumpkvs()

            logger.log("------Testing rollouts per meta-task = ", i,
                       "finished------")
            # NOTE(review): stray opening triple-quote below — the snippet is
            # truncated here; the remainder of this (likely commented-out)
            # region is missing, leaving an unterminated string literal.
            '''
コード例 #12
0
def main(config):
    """
    Builds the ProMP meta-RL training pipeline from *config* and runs it.

    Args:
        config (dict): experiment configuration — seed, baseline/env class
            names, sampler and algorithm hyperparameters, and checkpoint
            settings.
    """
    set_seed(config['seed'])

    # Instantiate the baseline and environment classes by name, then wrap the
    # env with the normalization wrapper.
    baseline = globals()[config['baseline']]()
    env = normalize(globals()[config['env']]())

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    # "rollouts_per_meta_task" plays the role of a per-task batch size.
    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    # Let TF grow GPU memory on demand instead of claiming it all up front.
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=session_config)

    saver = tf.train.Saver(
        keep_checkpoint_every_n_hours=config['keep_checkpoint_every_n_hours'],
        max_to_keep=config['max_checkpoints_to_keep'])

    # NOTE(review): `args` is not defined in this function — presumably a
    # module-level argparse namespace; confirm it exists before main() runs.
    save_path = os.path.join(args.dump_path, 'model.ckpt')

    if config['restore_path'] is not None:
        logger.log('Restoring parameters from {}'.format(
            config['restore_path']))
        saver.restore(sess, config['restore_path'])
        logger.log('Restored')

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        saver=saver,
        save_path=save_path,
        save_steps=config['save_steps'],
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        sess=sess,
    )

    trainer.train()
コード例 #13
0
    def train(self):
        """
        Trains policy on env using algo, then evaluates on the test-task split
        each iteration (with 20 and with 2 rollouts per task).

        Pseudocode::

            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [
                var for var in tf.global_variables()
                if not sess.run(tf.is_variable_initialized(var))
            ]
            sess.run(tf.variables_initializer(uninit_vars))

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)
                logger.log(
                    "Sampling set of tasks/goals for this meta-batch...")

                self.sampler.update_tasks()  # sample tasks!
                self.policy.switch_to_pre_update(
                )  # Switch to pre-update policy

                all_samples_data, all_paths = [], []
                list_sampling_time, list_inner_step_time, list_outer_step_time, list_proc_samples_time = [], [], [], []
                start_total_inner_time = time.time()
                # One pass per inner gradient step plus a final pass that only
                # collects post-adaptation samples (no _adapt on the last step).
                for step in range(self.num_inner_grad_steps + 1):

                    logger.log('** Step ' + str(step) + ' **')
                    """ -------------------- Sampling --------------------------"""

                    logger.log("Obtaining samples...")
                    time_env_sampling_start = time.time()
                    '''
                    if step == self.num_inner_grad_steps:
                        temp = self.sampler.batch_size
                        self.sampler.update_batch_size(2)
                        paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step)
                        self.sampler.update_batch_size(temp)
                    else:
                        paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step)
                    '''
                    paths = self.sampler.obtain_samples(log=True,
                                                        log_prefix='Step_%d-' %
                                                        step)

                    list_sampling_time.append(time.time() -
                                              time_env_sampling_start)
                    all_paths.append(paths)
                    """ ----------------- Processing Samples ---------------------"""

                    logger.log("Processing samples...")
                    time_proc_samples_start = time.time()
                    samples_data = self.sample_processor.process_samples(
                        paths, log='all', log_prefix='Step_%d-' % step)
                    all_samples_data.append(samples_data)
                    list_proc_samples_time.append(time.time() -
                                                  time_proc_samples_start)

                    self.log_diagnostics(sum(list(paths.values()), []),
                                         prefix='Step_%d-' % step)
                    """ ------------------- Inner Policy Update --------------------"""

                    time_inner_step_start = time.time()
                    if step < self.num_inner_grad_steps:
                        logger.log("Computing inner policy updates...")
                        self.algo._adapt(samples_data)
                    # train_writer = tf.summary.FileWriter('/home/ignasi/Desktop/meta_policy_search_graph',
                    #                                      sess.graph)
                    list_inner_step_time.append(time.time() -
                                                time_inner_step_start)
                total_inner_time = time.time() - start_total_inner_time

                time_maml_opt_start = time.time()
                """ ------------------ Outer Policy Update ---------------------"""

                logger.log("Optimizing policy...")
                # This needs to take all samples_data so that it can construct graph for meta-optimization.
                time_outer_step_start = time.time()
                self.algo.optimize_policy(all_samples_data)
                """ ------------------ Test-split Performance for logging ---------------------"""

                logger.log(
                    "Testing on test-tasks split for logging, rollout_per_task = 20..."
                )
                undiscounted_returns = []

                # First evaluation pass: iterate the test split in meta-batches
                # using the sampler's current (training) batch size.
                for i in range(0, self.env.NUM_EVAL,
                               self.sampler.meta_batch_size):
                    self.sampler.update_tasks(
                        test=True, start_from=i)  # sample from test split!
                    self.policy.switch_to_pre_update(
                    )  # Switch to pre-update policy

                    for step in range(self.num_inner_grad_steps + 1):
                        logger.log("On Test: Obtaining samples...")
                        paths = self.sampler.obtain_samples(
                            log=False,
                            test=True)  # log_prefix='test-Step_%d-' % step

                        logger.log("On Test: Processing Samples...")
                        samples_data = self.sample_processor.process_samples(
                            paths, log=False
                        )  # log='all', log_prefix='test-Step_%d-' % step
                        self.log_diagnostics(sum(list(paths.values()), []),
                                             prefix='test20-Step_%d-' % step)
                        """ ------------------- Inner Policy Update / logging returns --------------------"""
                        if step < self.num_inner_grad_steps:
                            logger.log(
                                "On Test: Computing inner policy updates...")
                            self.algo._adapt(samples_data)
                        else:
                            paths = self.sample_processor.gao_paths(paths)
                            undiscounted_returns.extend(
                                [sum(path["rewards"]) for path in paths])

                test_average_return = np.mean(undiscounted_returns)
                logger.logkv('test20-AverageReturn', test_average_return)

                logger.log(
                    "Testing on test-tasks split for logging, rollout_per_task = 2..."
                )
                # Second evaluation pass: temporarily shrink the batch size to
                # 2 rollouts per task; restored after the loop below.
                sampler_batch_size = self.sampler.batch_size
                self.sampler.update_batch_size(2)  ##############
                undiscounted_returns = []

                for i in range(0, self.env.NUM_EVAL,
                               self.sampler.meta_batch_size):
                    self.sampler.update_tasks(
                        test=True, start_from=i)  # sample from test split!
                    self.policy.switch_to_pre_update(
                    )  # Switch to pre-update policy

                    for step in range(self.num_inner_grad_steps + 1):
                        logger.log("On Test: Obtaining samples...")
                        paths = self.sampler.obtain_samples(
                            log=False,
                            test=True)  # log_prefix='test-Step_%d-' % step

                        logger.log("On Test: Processing Samples...")
                        samples_data = self.sample_processor.process_samples(
                            paths, log=False
                        )  # log='all', log_prefix='test-Step_%d-' % step
                        self.log_diagnostics(sum(list(paths.values()), []),
                                             prefix='test-Step_%d-' % step)
                        """ ------------------- Inner Policy Update / logging returns --------------------"""
                        if step < self.num_inner_grad_steps:
                            logger.log(
                                "On Test: Computing inner policy updates...")
                            self.algo._adapt(samples_data)
                        else:
                            paths = self.sample_processor.gao_paths(paths)
                            undiscounted_returns.extend(
                                [sum(path["rewards"]) for path in paths])

                test_average_return = np.mean(undiscounted_returns)
                self.sampler.update_batch_size(sampler_batch_size)
                """ ------------------- Logging Stuff --------------------------"""
                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps',
                             self.sampler.total_timesteps_sampled)

                logger.logkv('test-AverageReturn', test_average_return)

                logger.logkv('Time-OuterStep',
                             time.time() - time_outer_step_start)
                logger.logkv('Time-TotalInner', total_inner_time)
                logger.logkv('Time-InnerStep', np.sum(list_inner_step_time))
                logger.logkv('Time-SampleProc', np.sum(list_proc_samples_time))
                logger.logkv('Time-Sampling', np.sum(list_sampling_time))

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)
                logger.logkv('Time-MAMLSteps',
                             time.time() - time_maml_opt_start)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()

        logger.log("Training finished")
        self.sess.close()
コード例 #14
0
    def train(self):
        """
        Trains policy on env using algo.

        Runs ``num_sapling_rounds`` sampling rounds per iteration: every round
        but the last is a "dry" run that only collects gradients so that the
        variance of the adaptation and meta gradients across rounds can be
        estimated; the final round additionally resamples tasks, logs
        diagnostics and performs the outer (meta) policy update.

        Pseudocode::

            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [
                var for var in tf.global_variables()
                if not sess.run(tf.is_variable_initialized(var))
            ]
            sess.run(tf.variables_initializer(uninit_vars))
            n_timesteps = 0

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)

                # One entry per sampling round; each entry holds the per-step
                # (and final meta) gradients returned by algo.compute_gradients.
                gradients = []
                # NOTE(review): attribute name "num_sapling_rounds" looks like
                # a typo of "num_sampling_rounds" but is kept for compatibility.
                for i in range(self.num_sapling_rounds):
                    logger.log("\n ----- Sampling Round %d ---" % i)

                    # All rounds except the last are dry: gradient collection
                    # only, no task resampling / diagnostics / outer update.
                    dry = i < self.num_sapling_rounds - 1

                    if not dry:
                        self.sampler.update_tasks()
                    self.policy.switch_to_pre_update(
                    )  # Switch to pre-update policy

                    all_samples_data, all_paths = [], []

                    for step in range(self.num_inner_grad_steps + 1):
                        logger.log('** Step ' + str(step) + ' **')

                        logger.log("Obtaining samples...")
                        paths = self.sampler.obtain_samples(
                            log=True, log_prefix='Step_%d-' % step)
                        all_paths.append(paths)

                        logger.log("Processing samples...")
                        samples_data = self.sample_processor.process_samples(
                            paths, log='all', log_prefix='Step_%d-' % step)
                        all_samples_data.append(samples_data)

                        if not dry:
                            self.log_diagnostics(sum(list(paths.values()), []),
                                                 prefix='Step_%d-' % step)

                        # The last pass only collects post-update samples; no
                        # further inner adaptation is performed there.
                        if step < self.num_inner_grad_steps:
                            logger.log("Computing inner policy updates...")
                            self.algo._adapt(samples_data)
                    """ compute gradients """
                    gradients.append(
                        self.algo.compute_gradients(all_samples_data))

                    if not dry:
                        """ ------------ Compute and log gradient variance ------------"""
                        # compute variance of adaptation gradients
                        for step_id in range(self.num_inner_grad_steps):
                            meta_batch_size = len(gradients[0][0])
                            grad_mean, grad_std, grad_rstd = [], [], []
                            for task_id in range(meta_batch_size):
                                # gradients[round][step][task] is a flat
                                # gradient vector; stack across rounds to get
                                # per-component statistics.
                                stacked_grads = np.stack([
                                    gradients[round_id][step_id][task_id] for
                                    round_id in range(self.num_sapling_rounds)
                                ],
                                                         axis=1)
                                std = np.std(stacked_grads, axis=1)
                                mean = np.abs(np.mean(stacked_grads, axis=1))
                                grad_mean.append(np.mean(mean))
                                grad_std.append(np.mean(std))
                                # epsilon guards against division by zero,
                                # consistent with the meta-gradient stats below
                                grad_rstd.append(np.mean(std / (mean + 1e-8)))

                            # BUG FIX: the key was previously the literal
                            # string 'Step_%i-GradientMean' (missing
                            # '% step_id'), so every step clobbered the same
                            # entry; the value also used only the last task's
                            # mean instead of the aggregate over all tasks.
                            logger.logkv('Step_%i-GradientMean' % step_id,
                                         np.mean(grad_mean))
                            logger.logkv('Step_%i-GradientStd' % step_id,
                                         np.mean(grad_std))
                            logger.logkv('Step_%i-GradientRStd' % step_id,
                                         np.mean(grad_rstd))

                        # compute variance of meta gradients
                        stacked_grads = np.stack([
                            gradients[round_id][self.num_inner_grad_steps]
                            for round_id in range(self.num_sapling_rounds)
                        ],
                                                 axis=1)
                        std = np.std(stacked_grads, axis=1)
                        mean = np.abs(np.mean(stacked_grads, axis=1))

                        meta_grad_std = np.mean(std)
                        meta_grad_rstd = np.mean(std / (mean + 1e-8))
                        meta_grad_rvar = np.mean(std**2 / (mean + 1e-8))

                        logger.logkv('Meta-GradientMean', np.mean(mean))
                        logger.logkv('Meta-GradientStd', meta_grad_std)
                        logger.logkv('Meta-GradientRStd', meta_grad_rstd)
                        logger.logkv('Meta-GradientRVariance', meta_grad_rvar)

                        # cosine distance of each round's meta gradient to the
                        # across-rounds mean meta gradient
                        cosine_dists = cdist(np.transpose(stacked_grads),
                                             np.transpose(
                                                 np.mean(stacked_grads,
                                                         axis=1).reshape(
                                                             (-1, 1))),
                                             metric='cosine')
                        mean_abs_cos_dist = np.mean(np.abs(cosine_dists))
                        mean_squared_cosine_dists = np.mean(cosine_dists**2)
                        mean_squared_cosine_dists_sqrt = np.sqrt(
                            mean_squared_cosine_dists)

                        logger.logkv('Meta-GradientCosAbs', mean_abs_cos_dist)
                        logger.logkv('Meta-GradientCosVar',
                                     mean_squared_cosine_dists)
                        logger.logkv('Meta-GradientCosStd',
                                     mean_squared_cosine_dists_sqrt)
                        """ ------------------ Outer Policy Update ---------------------"""

                        logger.log("Optimizing policy...")
                        # This needs to take all samples_data so that it can construct graph for meta-optimization.
                        self.algo.optimize_policy(all_samples_data)
                        """ ------------------- Logging Stuff --------------------------"""
                        n_timesteps += (self.num_inner_grad_steps +
                                        1) * self.sampler.total_samples
                        logger.logkv('n_timesteps', n_timesteps)

                        logger.log("Saving snapshot...")
                        params = self.get_itr_snapshot(itr)  # , **kwargs)
                        logger.save_itr_params(itr, params)
                        logger.log("Saved")

                        logger.logkv('Itr', itr)
                        logger.logkv('Time', time.time() - start_time)
                        logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.dumpkvs()

        logger.log("Training finished")
        self.sess.close()
コード例 #15
0
    def train(self):
        """
        Run the meta-training loop: sample trajectories on a fresh batch of
        tasks, perform the outer policy optimization, then evaluate on the
        held-out test-task split for logging.

        Pseudocode:
            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        with self.sess.as_default() as sess:

            # Initialize only the variables that were not restored from a
            # checkpoint.
            uninitialized = [
                v for v in tf.global_variables()
                if not sess.run(tf.is_variable_initialized(v))
            ]
            sess.run(tf.variables_initializer(uninitialized))

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                self.task = self.env.sample_tasks(self.sampler.meta_batch_size)
                self.sampler.set_tasks(self.task)
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)
                logger.log(
                    "Sampling set of tasks/goals for this meta-batch...")

                # -------------------- Sampling --------------------------
                logger.log("Obtaining samples...")
                t_sampling = time.time()
                train_paths = self.sampler.obtain_samples(log=True,
                                                          log_prefix='train-')
                sampling_time = time.time() - t_sampling

                # ----------------- Processing Samples --------------------
                logger.log("Processing samples...")
                t_processing = time.time()
                samples_data = self.sample_processor.process_samples(
                    train_paths, log='all', log_prefix='train-')
                proc_samples_time = time.time() - t_processing

                self.log_diagnostics(sum(train_paths.values(), []),
                                     prefix='train-')

                # ------------------ Policy Update ------------------------
                logger.log("Optimizing policy...")
                # optimize_policy needs the full samples_data so it can build
                # the graph for meta-optimization.
                t_optimization = time.time()
                self.algo.optimize_policy(samples_data)

                # ------- Test-split Performance for logging ---------------
                logger.log("Testing on test-tasks split for logging...")

                saved_batch_size = self.sampler.batch_size
                self.sampler.update_batch_size(3)  # few rollouts per eval task

                test_returns = []
                for start in range(0, self.env.NUM_EVAL,
                                   self.sampler.meta_batch_size):
                    # Draw the next meta-batch of tasks from the test split.
                    self.sampler.update_tasks(test=True, start_from=start)

                    logger.log("On Test: Obtaining samples...")
                    test_paths = self.sampler.obtain_samples(log=False,
                                                             test=True)

                    logger.log("On Test: Processing Samples...")
                    self.log_diagnostics(sum(list(test_paths.values()), []),
                                         prefix='test-')

                    # ----------------- Logging Returns --------------------
                    flat_paths = self.sample_processor.gao_paths(test_paths)
                    test_returns.extend(
                        sum(p["rewards"]) for p in flat_paths)

                test_average_return = np.mean(test_returns)
                self.sampler.update_batch_size(saved_batch_size)

                # ------------------- Logging Stuff ------------------------
                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps',
                             self.sampler.total_timesteps_sampled)

                logger.logkv('test-AverageReturn', test_average_return)

                logger.logkv('Time-Optimization',
                             time.time() - t_optimization)
                logger.logkv('Time-SampleProc', np.sum(proc_samples_time))
                logger.logkv('Time-Sampling', sampling_time)

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()
                if itr == 0:
                    # Freeze the graph after the first iteration to catch
                    # accidental graph growth in later iterations.
                    sess.graph.finalize()

        logger.log("Training finished")
        self.sess.close()
コード例 #16
0
    def optimize(self, input_val_dict):
        """
        Carries out one constrained (TRPO-style) optimization step: computes
        the gradient, solves for the natural-gradient descent direction via
        conjugate gradient, then backtracking-line-searches the step size so
        that the loss decreases while the constraint stays within bounds.

        Args:
            input_val_dict (dict): dict containing the values to be fed into
                the computation graph
        """
        logger.log("Start CG optimization")

        logger.log("computing loss before")
        loss_before = self.loss(input_val_dict)

        logger.log("performing update")

        logger.log("computing gradient")
        gradient = self.gradient(input_val_dict)
        logger.log("gradient computed")

        logger.log("computing descent direction")
        # Hx is a Hessian-vector-product callable; conjugate gradient solves
        # H * d = g approximately without forming H explicitly.
        Hx = self._hvp_approach.build_eval(input_val_dict)
        descent_direction = conjugate_gradients(Hx,
                                                gradient,
                                                cg_iters=self._cg_iters)

        # Largest step satisfying the quadratic approximation of the
        # constraint: sqrt(2 * max_constraint / (d^T H d)); epsilon avoids
        # division by zero.
        initial_step_size = np.sqrt(
            2.0 * self._max_constraint_val *
            (1. / (descent_direction.dot(Hx(descent_direction)) + 1e-8)))
        if np.isnan(initial_step_size):
            logger.log("Initial step size is NaN! Rejecting the step!")
            return

        initial_descent_step = initial_step_size * descent_direction
        logger.log("descent direction computed")

        # Remember current parameters so the step can be rolled back if the
        # line search fails.
        prev_params = self._target.get_param_values()
        prev_params_values = _flatten_params(prev_params)

        loss, constraint_val, n_iter, violated = 0, 0, 0, False
        # Backtracking line search: shrink the step geometrically
        # (ratio = backtrack_ratio ** n) until the loss improves and the
        # constraint is satisfied, or max_backtracks is exhausted.
        for n_iter, ratio in enumerate(self._backtrack_ratio**np.arange(
                self._max_backtracks)):
            cur_step = ratio * initial_descent_step
            cur_params_values = prev_params_values - cur_step
            cur_params = _unflatten_params(cur_params_values,
                                           params_example=prev_params)
            self._target.set_params(cur_params)

            loss, constraint_val = self.loss(
                input_val_dict), self.constraint_val(input_val_dict)
            if loss < loss_before and constraint_val <= self._max_constraint_val:
                break
        """ ------------------- Logging Stuff -------------------------- """
        # The checks below evaluate the last candidate tried (the accepted
        # one if the loop broke early).
        if np.isnan(loss):
            violated = True
            logger.log("Line search violated because loss is NaN")
        if np.isnan(constraint_val):
            violated = True
            logger.log("Line search violated because constraint %s is NaN" %
                       self._constraint_name)
        if loss >= loss_before:
            violated = True
            logger.log("Line search violated because loss not improving")
        if constraint_val >= self._max_constraint_val:
            violated = True
            logger.log(
                "Line search violated because constraint %s is violated" %
                self._constraint_name)

        # Roll back to the pre-step parameters unless violations are
        # explicitly accepted.
        if violated and not self._accept_violation:
            logger.log("Line search condition violated. Rejecting the step!")
            self._target.set_params(prev_params)

        logger.log("backtrack iters: %d" % n_iter)
        # NOTE(review): "computing loss after" is logged but the loss is not
        # actually recomputed here.
        logger.log("computing loss after")
        logger.log("optimization finished")
コード例 #17
0
ファイル: visualize_policy.py プロジェクト: Zber5/MMAML-rl
        policy=policy,
        inner_lr=params['inner_lr'],
        meta_batch_size=params['meta_batch_size'],
        num_inner_grad_steps=params['num_inner_grad_steps'],
        learning_rate=params['learning_rate'],
        num_ppo_steps=params['num_promp_steps'],
        clip_eps=params['clip_eps'],
        target_inner_step=params['target_inner_step'],
        init_inner_kl_penalty=params['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=params['adaptive_inner_kl_penalty'],
    )

    saver = tf.train.Saver()

    if args.restore_path is not None:
        logger.log('Restoring parameters from {}'.format(args.restore_path))
        saver.restore(sess, args.restore_path)
        logger.log('Restored')

    uninit_vars = [
        var for var in tf.global_variables()
        if not sess.run(tf.is_variable_initialized(var))
    ]
    sess.run(tf.variables_initializer(uninit_vars))

    wrapped_env = env
    while hasattr(wrapped_env, '_wrapped_env'):
        wrapped_env = wrapped_env._wrapped_env

    frame_skip = wrapped_env.frame_skip if hasattr(wrapped_env,
                                                   'frame_skip') else 1