Ejemplo n.º 1
0
class TestLikelihoodRation(unittest.TestCase):
    """
    Check that the likelihood ratio at the first gradient step is approximately one.

    Before any inner update the policy that generated the samples and the
    policy evaluated through the parameter placeholders share the same
    parameter values (pi_old = pi_new), so the ratio of their action
    likelihoods is expected to be 1 for every sample.
    """

    def setUp(self):
        """Build the env, baseline, meta-policy, sampler, sample processor and ProMP algo used by the test."""
        # A single value shared by policy, sampler and algorithm so they
        # cannot drift apart.
        meta_batch_size = 10

        self.env = env = MetaPointEnv()

        self.baseline = baseline = LinearFeatureBaseline()

        self.policy = policy = MetaGaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            meta_batch_size=meta_batch_size,
            hidden_sizes=(16, 16),
            learn_std=True,
            hidden_nonlinearity=tf.tanh,
            output_nonlinearity=None,
        )

        self.sampler = MetaSampler(
            env=env,
            policy=policy,
            rollouts_per_meta_task=2,
            meta_batch_size=meta_batch_size,
            max_path_length=50,
            parallel=False,
        )

        self.sample_processor = MetaSampleProcessor(
            baseline=baseline,
            discount=0.99,
            gae_lambda=1.0,
            normalize_adv=True,
            positive_adv=False,
        )

        self.algo = ProMP(
            policy=policy,
            inner_lr=0.1,
            meta_batch_size=meta_batch_size,
            num_inner_grad_steps=2,
            learning_rate=1e-3,
            num_ppo_steps=5,
            num_minibatches=1,
            clip_eps=0.5,
            target_inner_step=2e-2,
            init_inner_kl_penalty=1e-3,
        )

    def test_likelihood_ratio(self):
        """Sample once with the pre-update policy and assert a likelihood ratio of ~1 for every task."""
        with tf.Session() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [
                var for var in tf.global_variables()
                if not sess.run(tf.is_variable_initialized(var))
            ]
            sess.run(tf.variables_initializer(uninit_vars))

            self.sampler.update_tasks()
            self.policy.switch_to_pre_update()  # Switch to pre-update policy

            # -------------------- Sampling --------------------------
            # Only the first (step 0) batch is needed: the check concerns the
            # ratio before any adaptation has happened.
            paths = self.sampler.obtain_samples(log_prefix="0")

            # ----------------- Processing Samples ---------------------
            samples_data = self.sample_processor.process_samples(paths,
                                                                 log=False)

            # ------------------- Inner Policy Update --------------------
            # The advantage placeholders and the flat placeholder list are
            # not needed to evaluate the likelihood ratio.
            obs_phs, action_phs, _, dist_info_phs, _ = (
                self.algo._make_input_placeholders(''))

            # Nothing in the loop below modifies the policy parameters, so
            # fetch the current values once instead of once per task.
            param_vals = self.policy.get_param_values()

            for i in range(self.algo.meta_batch_size):
                task_samples = samples_data[i]
                params_phs = self.policy.policies_params_phs[i]

                likelihood_ratio_sym = self.policy.likelihood_ratio_sym(
                    obs_phs[i], action_phs[i], dist_info_phs[i], params_phs)

                # Placeholders and values are paired positionally; this
                # relies on both mappings iterating in the same order.
                feed_dict = {
                    obs_phs[i]: task_samples['observations'],
                    action_phs[i]: task_samples['actions'],
                }
                feed_dict.update(
                    zip(params_phs.values(), param_vals.values()))
                feed_dict.update(
                    zip(dist_info_phs[i].values(),
                        task_samples['agent_infos'].values()))

                ratio = sess.run(likelihood_ratio_sym, feed_dict=feed_dict)

                # Same tolerances as np.allclose's defaults, but a failure
                # reports the offending values and the task index.
                np.testing.assert_allclose(
                    ratio, 1, rtol=1e-05, atol=1e-08,
                    err_msg='likelihood ratio deviates from 1 for task %d' % i)
Ejemplo n.º 2
0
        'dt'), 'environment must have dt attribute that specifies the timestep'
    timestep = wrapped_env.dt
    speedup = 1

    with sess.as_default() as sess:
        policy.switch_to_pre_update()

        # Preupdate:
        tasks = env.sample_tasks(params['meta_batch_size'])
        sampler.vec_env.set_tasks(tasks)

        # Preupdate:
        for i in range(params['num_inner_grad_steps']):
            paths = sampler.obtain_samples(log=False)
            samples_data = sample_processor.process_samples(paths,
                                                            log=True,
                                                            log_prefix='%i_' %
                                                            i)
            env.log_diagnostics(sum(list(paths.values()), []),
                                prefix='%i_' % i)
            algo._adapt(samples_data)

        paths = sampler.obtain_samples(log=False)
        samples_data = sample_processor.process_samples(
            paths, log=True, log_prefix='%i_' % params['num_inner_grad_steps'])
        env.log_diagnostics(sum(list(paths.values()), []),
                            prefix='%i_' % params['num_inner_grad_steps'])
        logger.dumpkvs()
        images = []

        # Postupdate:
        for _ in range(args.num_trajs):