class TestLinearFeatureBaseline(unittest.TestCase):
    def setUp(self):
        self.random_env = RandomEnv()
        self.random_policy = RandomPolicy(1, 1)
        self.meta_batch_size = 2
        self.batch_size = 10
        self.path_length = 100
        self.linear = LinearFeatureBaseline()
        self.sampler = MAMLSampler(self.random_env, self.random_policy, self.batch_size,
                                   self.meta_batch_size, self.path_length, parallel=True)

    def testFit(self):
        # Fitting the baseline on a task's paths should reduce the squared prediction error.
        paths = self.sampler.obtain_samples()
        for task in paths.values():
            unfit_error = 0
            for path in task:
                path["returns"] = utils.discount_cumsum(path["rewards"], 0.99)
                unfit_pred = self.linear.predict(path)
                unfit_error += sum([np.square(pred - actual)
                                    for pred, actual in zip(unfit_pred, path['returns'])])
            self.linear.fit(task)
            fit_error = 0
            for path in task:
                fit_pred = self.linear.predict(path)
                fit_error += sum([np.square(pred - actual)
                                  for pred, actual in zip(fit_pred, path['returns'])])
            self.assertTrue(fit_error < unfit_error)

    def testSerialize(self):
        # Pickling and unpickling the fitted baseline must not change its predictions.
        paths = self.sampler.obtain_samples()
        for task in paths.values():
            for path in task:
                path["returns"] = utils.discount_cumsum(path["rewards"], 0.99)
            self.linear.fit(task)
            fit_error_pre = 0
            for path in task:
                fit_pred = self.linear.predict(path)
                fit_error_pre += sum([np.square(pred - actual)
                                      for pred, actual in zip(fit_pred, path['returns'])])
            pkl = pickle.dumps(self.linear)
            self.linear = pickle.loads(pkl)
            fit_error_post = 0
            for path in task:
                fit_pred = self.linear.predict(path)
                fit_error_post += sum([np.square(pred - actual)
                                       for pred, actual in zip(fit_pred, path['returns'])])
            self.assertEqual(fit_error_pre, fit_error_post)
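# A minimal, self-contained sketch of the kind of baseline the test above exercises.
# Assumption: LinearFeatureBaseline follows the common rllab-style recipe of regularized
# least squares on handcrafted features of the observations and the time step; the actual
# implementation in this repository may differ in details.
import numpy as np


class LinearFeatureBaselineSketch:
    def __init__(self, reg_coeff=1e-5):
        self._coeffs = None
        self._reg_coeff = reg_coeff

    def _features(self, path):
        # path["observations"] is assumed to have shape (T, obs_dim)
        obs = np.clip(path["observations"], -10, 10)
        t = np.arange(len(path["rewards"])).reshape(-1, 1) / 100.0
        return np.concatenate([obs, obs ** 2, t, t ** 2, t ** 3, np.ones_like(t)], axis=1)

    def fit(self, paths):
        # Least-squares fit of features -> empirical discounted returns.
        feats = np.concatenate([self._features(p) for p in paths])
        returns = np.concatenate([p["returns"] for p in paths])
        reg = self._reg_coeff * np.eye(feats.shape[1])
        self._coeffs = np.linalg.lstsq(feats.T @ feats + reg, feats.T @ returns, rcond=None)[0]

    def predict(self, path):
        # Before fitting, predict zeros; the "unfit" error in testFit is then just the
        # squared returns, which the fitted baseline should beat.
        if self._coeffs is None:
            return np.zeros(len(path["rewards"]))
        return self._features(path) @ self._coeffs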
def main(config):
    baseline = LinearFeatureBaseline()

    # env = normalize(HalfCheetahRandDirecEnv())
    env = HopperRandParamsEnv(3.5)

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )

    algo = PPOMAML(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_ppo_steps'],
        num_minibatches=config['num_minibatches'],
        clip_eps=config['clip_eps'],
        clip_outer=config['clip_outer'],
        target_outer_step=config['target_outer_step'],
        target_inner_step=config['target_inner_step'],
        init_outer_kl_penalty=config['init_outer_kl_penalty'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_outer_kl_penalty=config['adaptive_outer_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
        anneal_factor=config['anneal_factor'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],  # This is repeated in MAMLPPO, it's confusing
    )

    trainer.train()
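# Illustrative config for the PPO-MAML main() above: it covers every key the function
# reads, but the concrete values are guesses for a sketch, not the repository's defaults.
example_config = {
    'meta_batch_size': 40,
    'hidden_sizes': (64, 64),
    'rollouts_per_meta_task': 20,
    'max_path_length': 100,
    'parallel': True,
    'discount': 0.99,
    'gae_lambda': 1.0,
    'normalize_adv': True,
    'positive_adv': False,
    'inner_lr': 0.1,
    'num_inner_grad_steps': 1,
    'learning_rate': 1e-3,
    'num_ppo_steps': 5,
    'num_minibatches': 1,
    'clip_eps': 0.3,
    'clip_outer': True,
    'target_outer_step': 0,
    'target_inner_step': 1e-2,
    'init_outer_kl_penalty': 0,
    'init_inner_kl_penalty': 1e-3,
    'adaptive_outer_kl_penalty': False,
    'adaptive_inner_kl_penalty': True,
    'anneal_factor': 1.0,
    'n_itr': 1000,
}
# main(example_config)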
def main(config):
    baseline = LinearFeatureBaseline()

    env = normalize(HalfCheetahRandDirecEnv())

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )

    algo = TRPOMAML(
        policy=policy,
        step_size=config['step_size'],
        inner_type=config['inner_type'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        inner_lr=config['inner_lr'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],  # This is repeated in MAMLPPO, it's confusing
    )

    trainer.train()
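# The TRPO-MAML main() above consumes the same sampler/sample-processor keys as the
# PPO-MAML example earlier, plus the algorithm-specific entries sketched here. The values
# (and the 'inner_type' option name) are illustrative assumptions.
example_trpo_keys = {
    'step_size': 0.01,                  # trust-region (KL) constraint for the outer update
    'inner_type': 'likelihood_ratio',   # assumed option name; check TRPOMAML for valid choices
    'inner_lr': 0.1,
    'num_inner_grad_steps': 1,
    'n_itr': 1000,
}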
def main(config):
    baseline = LinearFeatureBaseline()

    env = normalize(HopperRandParamsEnv())
    obs_dim = np.prod(env.observation_space.shape)

    policy = GaussianMLPPolicy(
        name="meta-policy",
        obs_dim=obs_dim,
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
        envs_per_task=5,
    )

    sample_processor = SingleSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )

    algo = PPO(
        policy=policy,
        learning_rate=config['learning_rate'],
        max_epochs=config['max_epochs'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
    )

    trainer.train()
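# The plain-PPO script above reads a smaller config than the MAML variants: only the
# policy/sampler keys plus 'learning_rate', 'max_epochs' and 'n_itr'. Illustrative values
# (assumptions, not repository defaults):
example_ppo_config = {
    'meta_batch_size': 40,
    'hidden_sizes': (64, 64),
    'rollouts_per_meta_task': 20,
    'max_path_length': 100,
    'parallel': True,
    'discount': 0.99,
    'gae_lambda': 1.0,
    'normalize_adv': True,
    'positive_adv': False,
    'learning_rate': 1e-3,
    'max_epochs': 5,
    'n_itr': 1000,
}
# main(example_ppo_config)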
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap', snapshot_gap=50)
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'),
              indent=2, sort_keys=True, cls=ClassEncoder)

    # Instantiate classes
    set_seed(kwargs['seed'])

    reward_baseline = LinearTimeBaseline()
    return_baseline = LinearFeatureBaseline()

    env = normalize(kwargs['env']())  # Wrappers?

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),  # Todo...?
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=kwargs['meta_batch_size'],
        hidden_sizes=kwargs['hidden_sizes'],
        learn_std=kwargs['learn_std'],
        hidden_nonlinearity=kwargs['hidden_nonlinearity'],
        output_nonlinearity=kwargs['output_nonlinearity'],
    )

    # Load policy here

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        parallel=kwargs['parallel'],
        envs_per_task=int(kwargs['rollouts_per_meta_task'] / 2),
    )

    sample_processor = DiceMAMLSampleProcessor(
        baseline=reward_baseline,
        max_path_length=kwargs['max_path_length'],
        discount=kwargs['discount'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
        return_baseline=return_baseline,
    )

    algo = VPG_DICEMAML(
        policy=policy,
        max_path_length=kwargs['max_path_length'],
        meta_batch_size=kwargs['meta_batch_size'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        inner_lr=kwargs['inner_lr'],
        learning_rate=kwargs['learning_rate'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=kwargs['n_itr'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
    )

    trainer.train()
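# run_experiment() above is driven entirely by keyword arguments (typically supplied by a
# sweep/launcher script). A minimal illustrative call is sketched here; the values and the
# choice of environment class are assumptions, not the repository's published settings.
# run_experiment(
#     seed=1,
#     env=HopperRandParamsEnv,        # any env class constructible with no arguments
#     meta_batch_size=40,
#     hidden_sizes=(64, 64),
#     learn_std=True,
#     hidden_nonlinearity=tf.tanh,
#     output_nonlinearity=None,
#     rollouts_per_meta_task=20,
#     max_path_length=100,
#     parallel=True,
#     discount=0.99,
#     normalize_adv=True,
#     positive_adv=False,
#     num_inner_grad_steps=1,
#     inner_lr=0.1,
#     learning_rate=1e-3,
#     n_itr=1000,
# )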
args = parser.parse_args(sys.argv[1:])

sess = tf.InteractiveSession()

policy = joblib.load(args.policy)['policy']
policy.switch_to_pre_update()

baseline = LinearFeatureBaseline()

env = normalize(AntRandGoalEnv())

sampler = MAMLSampler(
    env=env,
    policy=policy,
    rollouts_per_meta_task=BATCH_SIZE,
    meta_batch_size=META_BATCH_SIZE,
    max_path_length=PATH_LENGTH,
    parallel=True,
    envs_per_task=20,
)

sample_processor = MAMLSampleProcessor(
    baseline=baseline,
    discount=0.99,
    gae_lambda=1,
    normalize_adv=True,
    positive_adv=False,
)

# Doesn't matter which algo
algo = VPGMAML(
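# The snippet above relies on a module-level argument parser and on the constants
# BATCH_SIZE, META_BATCH_SIZE and PATH_LENGTH defined earlier in the script. A plausible
# setup is sketched below; the flag name and constant values are assumptions.
# import argparse
# import sys
# import joblib
# import tensorflow as tf
#
# BATCH_SIZE = 20          # rollouts per meta-task
# META_BATCH_SIZE = 40     # tasks sampled per iteration
# PATH_LENGTH = 200        # max steps per rollout
#
# parser = argparse.ArgumentParser(description='Evaluate a saved MAML policy')
# parser.add_argument('policy', type=str,
#                     help='path to a joblib pickle containing {"policy": ...}')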
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap', snapshot_gap=50)
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'),
              indent=2, sort_keys=True, cls=ClassEncoder)

    # Instantiate classes
    set_seed(kwargs['seed'])

    sess = tf.Session()

    with sess.as_default() as sess:
        config = json.load(open(osp.join(kwargs['path'], 'params.json'), 'r'))
        data = joblib.load(osp.join(kwargs['path'], 'params.pkl'))
        policy = data['policy']
        env = data['env']
        baseline = data['baseline']

        if kwargs['rollouts_per_meta_task'] is None:
            rollouts_per_meta_task = int(np.ceil(config['rollouts_per_meta_task']
                                                 / config['meta_batch_size']))
        else:
            rollouts_per_meta_task = kwargs['rollouts_per_meta_task']

        sampler = MAMLSampler(
            env=env,
            policy=policy,
            rollouts_per_meta_task=rollouts_per_meta_task,
            meta_batch_size=config['meta_batch_size'],
            max_path_length=kwargs['max_path_length'],
            parallel=kwargs['parallel'],
        )

        sample_processor = SampleProcessor(
            baseline=baseline,
            discount=config['discount'],
            normalize_adv=config['normalize_adv'],
            positive_adv=config['positive_adv'],
        )

        algo = VPG(
            policy=policy,
            learning_rate=config['inner_lr'],
        )

        tester = Tester(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            n_itr=kwargs['n_itr'],
            sess=sess,
            task=None,
        )

        tester.train()
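# Illustrative invocation of the testing run_experiment() above: it restores the trained
# policy, env and baseline from a training run's output directory and adapts with VPG.
# The directory layout is implied by the loads above; the values here are assumptions.
# run_experiment(
#     seed=1,
#     path='data/<EXP_NAME>/',        # must contain params.json and params.pkl
#     rollouts_per_meta_task=None,    # None -> derive from the saved training config
#     max_path_length=100,
#     parallel=False,
#     n_itr=10,
# )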
class TestLikelihoodRatio(unittest.TestCase):
    """
    Assert that the likelihood ratio at the first gradient step is approximately one,
    since pi_old = pi_new.
    """

    def setUp(self):
        self.env = env = MetaPointEnv()
        self.baseline = baseline = LinearFeatureBaseline()
        self.policy = policy = MetaGaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            meta_batch_size=10,
            hidden_sizes=(16, 16),
            learn_std=True,
            hidden_nonlinearity=tf.tanh,
            output_nonlinearity=None,
        )

        self.sampler = MAMLSampler(
            env=env,
            policy=policy,
            rollouts_per_meta_task=2,
            meta_batch_size=10,
            max_path_length=50,
            parallel=False,
        )

        self.sample_processor = MAMLSampleProcessor(
            baseline=baseline,
            discount=0.99,
            gae_lambda=1.0,
            normalize_adv=True,
            positive_adv=False,
        )

        self.algo = PPOMAML(
            policy=policy,
            inner_lr=0.1,
            meta_batch_size=10,
            num_inner_grad_steps=2,
            learning_rate=1e-3,
            num_ppo_steps=5,
            num_minibatches=1,
            clip_eps=0.5,
            clip_outer=True,
            target_outer_step=0,
            target_inner_step=2e-2,
            init_outer_kl_penalty=0,
            init_inner_kl_penalty=1e-3,
            adaptive_outer_kl_penalty=False,
            adaptive_inner_kl_penalty=True,
            anneal_factor=1.0,
        )

    def test_likelihood_ratio(self):
        with tf.Session() as sess:
            # initialize uninitialized vars (only initialize vars that were not loaded)
            uninit_vars = [var for var in tf.global_variables()
                           if not sess.run(tf.is_variable_initialized(var))]
            sess.run(tf.variables_initializer(uninit_vars))

            self.sampler.update_tasks()
            self.policy.switch_to_pre_update()  # Switch to pre-update policy

            all_samples_data, all_paths = [], []

            for step in range(1):
                """ -------------------- Sampling --------------------------"""
                paths = self.sampler.obtain_samples(log_prefix=str(step))
                all_paths.append(paths)

                """ ----------------- Processing Samples ---------------------"""
                samples_data = self.sample_processor.process_samples(paths, log=False)
                all_samples_data.append(samples_data)

                """ ------------------- Inner Policy Update --------------------"""
                obs_phs, action_phs, adv_phs, dist_info_phs, all_phs = \
                    self.algo._make_input_placeholders('')

                for i in range(self.algo.meta_batch_size):
                    obs = samples_data[i]['observations']
                    actions = samples_data[i]['actions']
                    agent_infos = samples_data[i]['agent_infos']
                    param_vals = self.policy.get_param_values()

                    likelihood_ratio_sym = self.policy.likelihood_ratio_sym(
                        obs_phs[i], action_phs[i], dist_info_phs[i],
                        self.policy.policies_params_phs[i])

                    feed_dict_params = dict(zip(self.policy.policies_params_phs[i].values(),
                                                param_vals.values()))
                    feed_dict_dist_infos = dict(zip(dist_info_phs[i].values(),
                                                    agent_infos.values()))
                    feed_dict = {obs_phs[i]: obs, action_phs[i]: actions}
                    feed_dict.update(feed_dict_params)
                    feed_dict.update(feed_dict_dist_infos)

                    # The ratio pi_new(a|s) / pi_old(a|s) must be ~1 before any inner update.
                    lr = sess.run(likelihood_ratio_sym, feed_dict=feed_dict)
                    self.assertTrue(np.allclose(lr, 1))
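# Why the assertion above holds: for a diagonal-Gaussian policy the likelihood ratio is
# exp(log N(a; mu_new, sigma_new) - log N(a; mu_old, sigma_old)), which is exactly 1 when
# the two parameter sets coincide, as they do before any inner gradient step. A plain
# NumPy sketch of that identity (not the library's likelihood_ratio_sym implementation):
import numpy as np


def gaussian_log_likelihood(actions, means, log_stds):
    # Per-sample log-density of a diagonal Gaussian.
    z = (actions - means) / np.exp(log_stds)
    return -0.5 * np.sum(z ** 2 + 2 * log_stds + np.log(2 * np.pi), axis=-1)


actions = np.random.randn(5, 2)
means = np.random.randn(5, 2)
log_stds = np.zeros((5, 2))

ratio = np.exp(gaussian_log_likelihood(actions, means, log_stds)
               - gaussian_log_likelihood(actions, means, log_stds))
assert np.allclose(ratio, 1.0)  # pi_new == pi_old  =>  ratio == 1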