Example #1
    def setUp(self):
        self.test_env = TestEnv()
        self.random_env = RandomEnv()
        self.test_policy = TestPolicy(obs_dim=3, action_dim=4)
        self.return_policy = ReturnPolicy(obs_dim=3, action_dim=4)
        self.random_policy = RandomPolicy(obs_dim=3, action_dim=4)
        self.meta_batch_size = 3
        self.batch_size = 4
        self.path_length = 5
        # iterative (single-process) sampler
        self.it_sampler = MetaSampler(self.test_env, self.test_policy, self.batch_size,
                                      self.meta_batch_size, self.path_length, parallel=False)
        # parallel (multiprocessing) sampler
        self.par_sampler = MetaSampler(self.test_env, self.test_policy, self.batch_size,
                                       self.meta_batch_size, self.path_length, parallel=True)
        self.sample_processor = SampleProcessor(baseline=LinearFeatureBaseline())
        self.Meta_sample_processor = MetaSampleProcessor(baseline=LinearFeatureBaseline())
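For context, a hypothetical test method that could follow this setUp, checking that both the iterative and the parallel sampler return one batch of paths per meta task. The obtain_samples() call and the per-task dict structure are taken from Example #3 below; the method name and the length check against meta_batch_size are illustrative assumptions, not code from the original test suite.

    def test_obtain_samples_structure(self):
        # Hypothetical check: both sampler variants should yield a dict with
        # one entry (a list of paths) per meta task.
        for sampler in (self.it_sampler, self.par_sampler):
            paths = sampler.obtain_samples()
            self.assertEqual(len(paths), self.meta_batch_size)
            for task_paths in paths.values():
                for path in task_paths:
                    self.assertIn('rewards', path)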
Example #2
    def setUp(self):
        self.random_env = RandomEnv()
        self.random_policy = RandomPolicy(1, 1)
        self.meta_batch_size = 2
        self.batch_size = 10
        self.path_length = 100
        self.linear = LinearFeatureBaseline()
        self.sampler = MetaSampler(self.random_env,
                                   self.random_policy,
                                   self.batch_size,
                                   self.meta_batch_size,
                                   self.path_length,
                                   parallel=True)
Example #3
class TestLinearFeatureBaseline(unittest.TestCase):
    def setUp(self):
        self.random_env = RandomEnv()
        self.random_policy = RandomPolicy(1, 1)
        self.meta_batch_size = 2
        self.batch_size = 10
        self.path_length = 100
        self.linear = LinearFeatureBaseline()
        self.sampler = MetaSampler(self.random_env,
                                   self.random_policy,
                                   self.batch_size,
                                   self.meta_batch_size,
                                   self.path_length,
                                   parallel=True)

    def testFit(self):
        paths = self.sampler.obtain_samples()
        for task in paths.values():
            unfit_error = 0
            for path in task:
                path["returns"] = utils.discount_cumsum(path["rewards"], 0.99)
                unfit_pred = self.linear.predict(path)
                unfit_error += sum([
                    np.square(pred - actual)
                    for pred, actual in zip(unfit_pred, path['returns'])
                ])
            self.linear.fit(task)
            fit_error = 0
            for path in task:
                fit_pred = self.linear.predict(path)
                fit_error += sum([
                    np.square(pred - actual)
                    for pred, actual in zip(fit_pred, path['returns'])
                ])
            self.assertTrue(fit_error < unfit_error)

    def testSerialize(self):
        paths = self.sampler.obtain_samples()
        for task in paths.values():
            for path in task:
                path["returns"] = utils.discount_cumsum(path["rewards"], 0.99)
            self.linear.fit(task)
            fit_error_pre = 0
            for path in task:
                fit_pred = self.linear.predict(path)
                fit_error_pre += sum([
                    np.square(pred - actual)
                    for pred, actual in zip(fit_pred, path['returns'])
                ])
            pkl = pickle.dumps(self.linear)
            self.linear = pickle.loads(pkl)
            fit_error_post = 0
            for path in task:
                fit_pred = self.linear.predict(path)
                fit_error_post += sum([
                    np.square(pred - actual)
                    for pred, actual in zip(fit_pred, path['returns'])
                ])
            self.assertEqual(fit_error_pre, fit_error_post)
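For reference, the "returns" attached to each path above are discounted cumulative rewards. A minimal sketch of what utils.discount_cumsum(rewards, discount) is assumed to compute (the conventional reverse discounted sum); the implementation below is illustrative, not the one from the repository's utils module:

import numpy as np

def discount_cumsum(rewards, discount):
    # returns[t] = rewards[t] + discount * returns[t + 1], computed back to front
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns

# e.g. discount_cumsum([1.0, 1.0, 1.0], 0.99) -> [2.9701, 1.99, 1.0]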
Example #4
def main(config):
    set_seed(config['seed'])

    reward_baseline = LinearTimeBaseline()  # the usual baseline
    return_baseline = LinearFeatureBaseline()  # the additional baseline for DICE

    env = globals()[config['env']]()  # instantiate env
    env = normalize(env)  # apply normalize wrapper to env

    meta_baseline = MetaNNBaseline(
        input_size=env.observation_space.shape[0])  # the meta baseline

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = TMAMLMetaSampleProcessor(
        baseline=reward_baseline,
        max_path_length=config['max_path_length'],
        discount=config['discount'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
        return_baseline=return_baseline,
        metabaseline=meta_baseline,
    )

    algo = TMAML(policy=policy,
                 max_path_length=config['max_path_length'],
                 meta_batch_size=config['meta_batch_size'],
                 num_inner_grad_steps=config['num_inner_grad_steps'],
                 inner_lr=config['inner_lr'],
                 learning_rate=config['learning_rate'])

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )

    trainer.train()
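A hypothetical invocation of the main() function above; the keys mirror exactly the config entries that main() reads, while the environment name and hyperparameter values are placeholders rather than values from the original experiment scripts:

if __name__ == '__main__':
    config = {
        'seed': 1,
        'env': 'HalfCheetahRandDirecEnv',  # any env class importable in this module
        'meta_batch_size': 20,
        'hidden_sizes': (64, 64),
        'rollouts_per_meta_task': 20,
        'max_path_length': 100,
        'parallel': False,
        'discount': 0.99,
        'normalize_adv': True,
        'positive_adv': False,
        'num_inner_grad_steps': 1,
        'inner_lr': 0.1,
        'learning_rate': 1e-3,
        'n_itr': 500,
    }
    main(config)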
Example #5
def main(config):

    baseline = LinearFeatureBaseline()
    #env = rl2env(HalfCheetahRandDirecEnv())
    env = rl2env(globals()[config['env']]())  # instantiate env
    obs_dim = np.prod(env.observation_space.shape) + np.prod(env.action_space.shape) + 1 + 1

    policy = GaussianRNNPolicy(
            name="meta-policy",
            obs_dim=obs_dim,
            action_dim=np.prod(env.action_space.shape),
            meta_batch_size=config['meta_batch_size'],
            hidden_sizes=config['hidden_sizes'],
            cell_type=config['cell_type']
    )

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
        envs_per_task=1,
    )

    sample_processor = RL2SampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )

    algo = PPO(
        policy=policy,
        learning_rate=config['learning_rate'],
        max_epochs=config['max_epochs']
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
    )
    trainer.train()
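The obs_dim computed above follows the RL^2 convention of feeding the policy the environment observation concatenated with the previous action, the previous reward, and a done flag (hence the two extra + 1 terms). A small illustrative sketch of how such an input vector would be assembled per step, under that assumption; rl2_input is a hypothetical helper, not a function from the repository:

import numpy as np

def rl2_input(obs, prev_action, prev_reward, done):
    # observation + previous action + previous reward + done flag
    return np.concatenate([
        np.ravel(obs),
        np.ravel(prev_action),
        [prev_reward],
        [float(done)],
    ])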
Example #6
    def setUp(self):
        self.env = env = MetaPointEnv()

        self.baseline = baseline = LinearFeatureBaseline()

        self.policy = policy = MetaGaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            meta_batch_size=10,
            hidden_sizes=(16, 16),
            learn_std=True,
            hidden_nonlinearity=tf.tanh,
            output_nonlinearity=None,
        )

        self.sampler = MetaSampler(
            env=env,
            policy=policy,
            rollouts_per_meta_task=2,
            meta_batch_size=10,
            max_path_length=50,
            parallel=False,
        )

        self.sample_processor = MetaSampleProcessor(
            baseline=baseline,
            discount=0.99,
            gae_lambda=1.0,
            normalize_adv=True,
            positive_adv=False,
        )

        self.algo = ProMP(
            policy=policy,
            inner_lr=0.1,
            meta_batch_size=10,
            num_inner_grad_steps=2,
            learning_rate=1e-3,
            num_ppo_steps=5,
            num_minibatches=1,
            clip_eps=0.5,
            target_inner_step=2e-2,
            init_inner_kl_penalty=1e-3,
        )
Example #7
    def test_process_samples_advantages2(self):
        for normalize_adv in [True, False]:
            for paths in [self.paths, self.paths_rand]:
                return_baseline = LinearFeatureBaseline()
                dice_sample_processor = DiceSampleProcessor(
                    self.baseline, max_path_length=6, gae_lambda=1.0, discount=0.97,
                    normalize_adv=normalize_adv, return_baseline=return_baseline)
                dice_samples_data = dice_sample_processor.process_samples(paths[0])
                mask = dice_samples_data['mask']

                # reshape data and filter out masked items:

                sample_processor = SampleProcessor(return_baseline, gae_lambda=1.0,
                                                   discount=0.97, normalize_adv=normalize_adv)
                samples_data = sample_processor.process_samples(paths[0])

                self.assertAlmostEqual(np.sum(mask[:, :, None] * dice_samples_data['observations']),
                                       np.sum(samples_data['observations']))
                self.assertAlmostEqual(np.sum(mask[:, :, None] * dice_samples_data['actions']),
                                       np.sum(samples_data['actions']))
                self.assertAlmostEqual(np.sum(mask * dice_samples_data['advantages']),
                                       np.sum(samples_data['advantages']), places=2)
                self.assertAlmostEqual(np.sum(mask * dice_samples_data['rewards']),
                                       np.sum(samples_data['rewards']))
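A tiny numeric illustration of the mask comparison used in this test: DICE-style processors pad every path to max_path_length and return a binary mask over the valid steps, so sums over the masked, padded arrays should agree with sums over the unpadded data that SampleProcessor returns. The arrays below are made-up toy values, not data from the test:

import numpy as np

padded = np.array([[1.0, 2.0, 0.0],    # path of length 2, zero-padded to 3
                   [3.0, 4.0, 5.0]])   # path of length 3
mask = np.array([[1.0, 1.0, 0.0],
                 [1.0, 1.0, 1.0]])
flat = np.array([1.0, 2.0, 3.0, 4.0, 5.0])  # the same steps without padding
assert np.isclose(np.sum(mask * padded), np.sum(flat))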
Example #8
    def test_process_samples_advantages1(self):
        return_baseline = LinearFeatureBaseline()
        sample_processor = DiceSampleProcessor(self.baseline, max_path_length=6,
                                               return_baseline=return_baseline)
        samples_data = sample_processor.process_samples(self.paths[0])
        self.assertEqual(samples_data['advantages'].shape, (self.batch_size, 6))
        self.assertEqual(samples_data['advantages'].ndim, 2)
Example #9
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap',
                     snapshot_gap=50)
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)

    # Instantiate classes
    set_seed(kwargs['seed'])

    reward_baseline = LinearTimeBaseline()
    return_baseline = LinearFeatureBaseline()

    env = normalize(kwargs['env']())  # Wrappers?

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),  # Todo...?
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=kwargs['meta_batch_size'],
        hidden_sizes=kwargs['hidden_sizes'],
        learn_std=kwargs['learn_std'],
        hidden_nonlinearity=kwargs['hidden_nonlinearity'],
        output_nonlinearity=kwargs['output_nonlinearity'],
    )

    # Load policy here

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        parallel=kwargs['parallel'],
        envs_per_task=int(kwargs['rollouts_per_meta_task'] / 2))

    sample_processor = DiceMAMLSampleProcessor(
        baseline=reward_baseline,
        max_path_length=kwargs['max_path_length'],
        discount=kwargs['discount'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
        return_baseline=return_baseline)

    algo = VPG_DICEMAML(policy=policy,
                        max_path_length=kwargs['max_path_length'],
                        meta_batch_size=kwargs['meta_batch_size'],
                        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
                        inner_lr=kwargs['inner_lr'],
                        learning_rate=kwargs['learning_rate'])

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=kwargs['n_itr'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
    )

    trainer.train()
Example #10
                        type=json.loads,
                        default={},
                        help='accepts json for overriding training parameters')
    parser.add_argument('--video_filename', default=None)
    parser.add_argument('--num_trajs', type=int, default=10)
    args = parser.parse_args(sys.argv[1:])

    params_path = os.path.join(
        os.path.split(args.restore_path)[0], 'params.json')

    with open(params_path, 'r') as f:
        params = json.load(f)

    params.update(args.overrides)

    baseline = LinearFeatureBaseline()

    env = globals()[params['env']]()  # instantiate env
    env = normalize(env)  # apply normalize wrapper to env

    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=gpu_config)

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=params['meta_batch_size'],
        hidden_sizes=params['hidden_sizes'],
        cell_size=params['cell_size'],