def test_get_params(self):
    box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
    cmb = ContinuousMLPBaseline(env_spec=box_env_spec)
    params_internal = cmb.get_params()
    trainable_params = tf.compat.v1.trainable_variables(
        scope='ContinuousMLPBaseline')
    assert np.array_equal(params_internal, trainable_params)
def test_get_params_internal(self, obs_dim):
    box_env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim))
    with mock.patch(('garage.tf.baselines.'
                     'continuous_mlp_baseline.'
                     'ContinuousMLPRegressor'),
                    new=SimpleMLPRegressor):
        cmb = ContinuousMLPBaseline(env_spec=box_env.spec)
        params_internal = cmb.get_params_internal()
        trainable_params = tf.compat.v1.trainable_variables(
            scope='ContinuousMLPBaseline')
        assert np.array_equal(params_internal, trainable_params)
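Several tests in this file patch ContinuousMLPRegressor with a SimpleMLPRegressor test double that lives in the test fixtures rather than here. The sketch below is a minimal illustration of the behavior those tests rely on: the class name, the constructor parameters, and the 'SimpleMLPModel/return_var' variable are taken from how the tests use the double, while the body is an assumption, not the actual fixture.

import numpy as np
import tensorflow as tf


class SimpleMLPRegressor:
    """Illustrative test double: fit() memorizes targets, predict() replays them."""

    def __init__(self, input_shape, output_dim, name, *args, **kwargs):
        del args, kwargs  # Accept and ignore ContinuousMLPRegressor's extras.
        self._input_shape = input_shape
        self._output_dim = output_dim
        with tf.compat.v1.variable_scope(name):
            # The pickling and param-value tests look this variable up as
            # '<scope>/SimpleMLPModel/return_var'.
            with tf.compat.v1.variable_scope('SimpleMLPModel'):
                self._return_var = tf.compat.v1.get_variable(
                    'return_var',
                    shape=(),
                    initializer=tf.constant_initializer(0.5))
        self._ys = None

    def fit(self, xs, ys):
        # Memorize the most recent regression targets.
        self._ys = ys

    def predict(self, xs):
        # Replay memorized targets; before any fit, broadcast return_var.
        if self._ys is None:
            val = tf.compat.v1.get_default_session().run(self._return_var)
            return np.full((len(xs), ), val)
        return self._ys

    def get_params_internal(self):
        # Let the baseline report its trainable variables.
        return tf.compat.v1.trainable_variables()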
def test_baseline(self):
    """Test the baseline initialization."""
    box_env = GarageEnv(DummyBoxEnv())
    deterministic_mlp_baseline = ContinuousMLPBaseline(env_spec=box_env)
    gaussian_mlp_baseline = GaussianMLPBaseline(env_spec=box_env)

    self.sess.run(tf.compat.v1.global_variables_initializer())
    deterministic_mlp_baseline.get_param_values()
    gaussian_mlp_baseline.get_param_values()

    box_env.close()
def test_unflattened_input(self):
    env = GymEnv(DummyBoxEnv(obs_dim=(2, 2)))
    cmb = ContinuousMLPBaseline(env_spec=env.spec)
    env.reset()
    es = env.step(1)
    obs, rewards = es.observation, es.reward
    train_paths = [{'observations': [obs], 'returns': [rewards]}]
    cmb.fit(train_paths)
    paths = {'observations': [obs]}
    prediction = cmb.predict(paths)
    assert np.allclose(0., prediction)
def ppo_cmb(env, seed, log_dir):
    """Create test continuous mlp baseline on ppo.

    Args:
        env (gym_env): Environment of the task.
        seed (int): Random seed for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to the CSV file containing the training results.

    """
    deterministic.set_seed(seed)
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                      intra_op_parallelism_threads=num_proc,
                                      inter_op_parallelism_threads=num_proc)
    sess = tf.compat.v1.Session(config=config)
    with LocalTFRunner(snapshot_config, sess=sess,
                       max_cpus=num_proc) as runner:
        env = TfEnv(normalize(env))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=policy_params['policy_hidden_sizes'],
            hidden_nonlinearity=policy_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            regressor_args=baseline_params['regressor_args'],
        )

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=algo_params['max_path_length'],
                   discount=algo_params['discount'],
                   gae_lambda=algo_params['gae_lambda'],
                   lr_clip_range=algo_params['lr_clip_range'],
                   entropy_method=algo_params['entropy_method'],
                   policy_ent_coeff=algo_params['policy_ent_coeff'],
                   optimizer_args=algo_params['optimizer_args'],
                   center_adv=algo_params['center_adv'],
                   stop_entropy_gradient=True)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo,
                     env,
                     sampler_args=dict(n_envs=algo_params['n_envs']))
        runner.train(n_epochs=algo_params['n_epochs'],
                     batch_size=algo_params['n_rollout_steps'])

        dowel_logger.remove_all()

        return tabular_log_file
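ppo_cmb reads several module-level settings (num_proc, policy_params, baseline_params, algo_params, snapshot_config) that the benchmark script defines elsewhere. The values below are illustrative assumptions, not the benchmark's actual configuration; they only show the shape each name needs for ppo_cmb to run.

import tensorflow as tf

num_proc = 4  # Assumed: CPU threads available to TF and the sampler.

policy_params = {
    'policy_hidden_sizes': 32,
    'hidden_nonlinearity': tf.nn.tanh,
}

baseline_params = {
    'regressor_args': dict(hidden_sizes=(64, 64)),
}

algo_params = {
    'max_path_length': 100,
    'discount': 0.99,
    'gae_lambda': 0.95,
    'lr_clip_range': 0.2,
    'entropy_method': 'max',
    'policy_ent_coeff': 0.02,
    'optimizer_args': dict(batch_size=32, max_epochs=10, learning_rate=1e-3),
    'center_adv': False,
    'n_envs': 8,
    'n_epochs': 10,
    'n_rollout_steps': 2048,
}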
def test_ppo_pendulum_recurrent_continuous_baseline(self):
    """Test PPO with Pendulum environment and recurrent policy."""
    with LocalTFRunner(snapshot_config) as runner:
        env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianLSTMPolicy(env_spec=env.spec)
        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        runner.setup(algo, env, sampler_cls=LocalSampler)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 100

        env.close()
def test_is_pickleable(self):
    box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
    cmb = ContinuousMLPBaseline(env_spec=box_env_spec)

    with tf.compat.v1.variable_scope('ContinuousMLPBaseline', reuse=True):
        bias = tf.compat.v1.get_variable('mlp/hidden_0/bias')
        bias.load(tf.ones_like(bias).eval())

    _, _, paths, _ = get_train_test_data()
    result1 = cmb.predict(paths)
    h = pickle.dumps(cmb)

    with tf.compat.v1.Session(graph=tf.Graph()):
        cmb_pickled = pickle.loads(h)
        result2 = cmb_pickled.predict(paths)
        assert np.array_equal(result1, result2)
def test_fit(self, obs_dim):
    box_env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim))
    with mock.patch(('garage.tf.baselines.'
                     'continuous_mlp_baseline.'
                     'ContinuousMLPRegressor'),
                    new=SimpleMLPRegressor):
        cmb = ContinuousMLPBaseline(env_spec=box_env.spec)
        paths = [{
            'observations': [np.full(obs_dim, 1)],
            'returns': [1]
        }, {
            'observations': [np.full(obs_dim, 2)],
            'returns': [2]
        }]
        cmb.fit(paths)

        obs = {'observations': [np.full(obs_dim, 1), np.full(obs_dim, 2)]}
        prediction = cmb.predict(obs)
        assert np.array_equal(prediction, [1, 2])
def test_fit_unnormalized(self):
    box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
    cmb = ContinuousMLPBaseline(env_spec=box_env_spec,
                                normalize_inputs=False)
    train_paths, _, paths, expected = get_train_test_data()

    for _ in range(20):
        cmb.fit(train_paths)

    prediction = cmb.predict(paths)
    assert np.allclose(prediction, expected, rtol=0, atol=0.1)

    x_mean = self.sess.run(cmb._x_mean)
    x_mean_expected = np.zeros_like(x_mean)
    x_std = self.sess.run(cmb._x_std)
    x_std_expected = np.ones_like(x_std)
    assert np.allclose(x_mean, x_mean_expected)
    assert np.allclose(x_std, x_std_expected)
def test_fit_normalized(self):
    box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
    cmb = ContinuousMLPBaseline(env_spec=box_env_spec)
    train_paths, observations, paths, expected = get_train_test_data()

    for _ in range(20):
        cmb.fit(train_paths)

    prediction = cmb.predict(paths)
    assert np.allclose(prediction, expected, rtol=0, atol=0.1)

    x_mean = self.sess.run(cmb._x_mean)
    x_mean_expected = np.mean(observations, axis=0, keepdims=True)
    x_std = self.sess.run(cmb._x_std)
    x_std_expected = np.std(observations, axis=0, keepdims=True)
    assert np.allclose(x_mean, x_mean_expected)
    assert np.allclose(x_std, x_std_expected)
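The fit tests above and the pickling test earlier all unpack a get_train_test_data() helper that is defined elsewhere in the test module. The sketch below is one minimal way to build it, consistent with how the tests consume its return value; the choice of 2-D observations whose target is their mean is an assumption, not the actual helper.

import numpy as np


def get_train_test_data():
    """Build (train_paths, observations, paths, expected) for the tests.

    The regression target is the mean of each 2-D observation; any smooth
    target a small MLP can fit in ~20 epochs would work equally well.
    """
    data = np.random.random(size=(110, 2))
    train_paths = [{
        'observations': [x],
        'returns': [np.mean(x)]
    } for x in data[:100]]
    observations = np.concatenate([p['observations'] for p in train_paths])
    paths = {'observations': list(data[100:])}
    expected = [np.mean(x) for x in data[100:]]
    return train_paths, observations, paths, expected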
def test_param_values(self):
    box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
    cmb = ContinuousMLPBaseline(env_spec=box_env_spec)
    new_cmb = ContinuousMLPBaseline(env_spec=box_env_spec,
                                    name='ContinuousMLPBaseline2')

    # Manually change a parameter of the first ContinuousMLPBaseline.
    with tf.compat.v1.variable_scope('ContinuousMLPBaseline', reuse=True):
        bias = tf.compat.v1.get_variable('mlp/hidden_0/bias')
        bias.load(tf.ones_like(bias).eval())

    old_param_values = cmb.get_param_values()
    new_param_values = new_cmb.get_param_values()
    assert not np.array_equal(old_param_values, new_param_values)

    new_cmb.set_param_values(old_param_values)
    new_param_values = new_cmb.get_param_values()
    assert np.array_equal(old_param_values, new_param_values)
def test_baseline(self):
    """Test the baseline initialization."""
    box_env = TfEnv(DummyBoxEnv())
    deterministic_mlp_baseline = ContinuousMLPBaseline(env_spec=box_env)
    gaussian_mlp_baseline = GaussianMLPBaseline(env_spec=box_env)

    discrete_env = TfEnv(Resize(DummyDiscrete2DEnv(), width=64, height=64))
    gaussian_conv_baseline = GaussianConvBaseline(
        env_spec=discrete_env,
        regressor_args=dict(conv_filters=[32, 32],
                            conv_filter_sizes=[1, 1],
                            conv_strides=[1, 1],
                            conv_pads=['VALID', 'VALID'],
                            hidden_sizes=(32, 32)))

    self.sess.run(tf.compat.v1.global_variables_initializer())
    deterministic_mlp_baseline.get_param_values(trainable=True)
    gaussian_mlp_baseline.get_param_values(trainable=True)
    gaussian_conv_baseline.get_param_values(trainable=True)

    box_env.close()
def test_is_pickleable(self):
    box_env = GarageEnv(DummyBoxEnv(obs_dim=(1, )))
    with mock.patch(('garage.tf.baselines.'
                     'continuous_mlp_baseline.'
                     'ContinuousMLPRegressor'),
                    new=SimpleMLPRegressor):
        cmb = ContinuousMLPBaseline(env_spec=box_env.spec)
        obs = {'observations': [np.full(1, 1), np.full(1, 1)]}

        with tf.compat.v1.variable_scope('ContinuousMLPBaseline',
                                         reuse=True):
            return_var = tf.compat.v1.get_variable(
                'SimpleMLPModel/return_var')
        return_var.load(1.0)

        prediction = cmb.predict(obs)

        h = pickle.dumps(cmb)
        with tf.compat.v1.Session(graph=tf.Graph()):
            cmb_pickled = pickle.loads(h)
            prediction2 = cmb_pickled.predict(obs)
            assert np.array_equal(prediction, prediction2)
def continuous_mlp_baseline(ctxt, env_id, seed):
    """Create Continuous MLP Baseline on TF-PPO.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=hyper_params['policy_hidden_sizes'],
            hidden_nonlinearity=hyper_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
        )

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=hyper_params['discount'],
                   gae_lambda=hyper_params['gae_lambda'],
                   lr_clip_range=hyper_params['lr_clip_range'],
                   entropy_method=hyper_params['entropy_method'],
                   policy_ent_coeff=hyper_params['policy_ent_coeff'],
                   optimizer_args=dict(
                       batch_size=32,
                       max_optimization_epochs=10,
                       learning_rate=1e-3,
                   ),
                   center_adv=hyper_params['center_adv'],
                   stop_entropy_gradient=True)

        trainer.setup(algo, env)
        trainer.train(n_epochs=hyper_params['n_epochs'],
                      batch_size=hyper_params['n_exploration_steps'])
def continuous_mlp_baseline(ctxt, env_id, seed):
    """Create Continuous MLP Baseline on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(normalize(gym.make(env_id)))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=hyper_params['policy_hidden_sizes'],
            hidden_nonlinearity=hyper_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
        )

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=hyper_params['max_path_length'],
                   discount=hyper_params['discount'],
                   gae_lambda=hyper_params['gae_lambda'],
                   lr_clip_range=hyper_params['lr_clip_range'],
                   entropy_method=hyper_params['entropy_method'],
                   policy_ent_coeff=hyper_params['policy_ent_coeff'],
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       learning_rate=1e-3,
                   ),
                   center_adv=hyper_params['center_adv'],
                   stop_entropy_gradient=True)

        runner.setup(algo,
                     env,
                     sampler_args=dict(n_envs=hyper_params['n_envs']))
        runner.train(n_epochs=hyper_params['n_epochs'],
                     batch_size=hyper_params['n_rollout_steps'])
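Both launcher variants above read a module-level hyper_params dict that their scripts define elsewhere. The values below are illustrative assumptions covering the keys each variant uses, not the scripts' actual settings.

import tensorflow as tf

hyper_params = {
    'policy_hidden_sizes': 32,
    'hidden_nonlinearity': tf.nn.tanh,
    'max_path_length': 100,  # Used only by the LocalTFRunner variant.
    'discount': 0.99,
    'gae_lambda': 0.95,
    'lr_clip_range': 0.2,
    'entropy_method': 'max',
    'policy_ent_coeff': 0.02,
    'center_adv': False,
    'n_envs': 8,  # Used only by the LocalTFRunner variant.
    'n_epochs': 10,
    'n_rollout_steps': 2048,  # LocalTFRunner variant.
    'n_exploration_steps': 2048,  # TFTrainer variant.
}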
def test_param_values(self, obs_dim):
    box_env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim))
    with mock.patch(('garage.tf.baselines.'
                     'continuous_mlp_baseline.'
                     'ContinuousMLPRegressor'),
                    new=SimpleMLPRegressor):
        cmb = ContinuousMLPBaseline(env_spec=box_env.spec)
        new_cmb = ContinuousMLPBaseline(env_spec=box_env.spec,
                                        name='ContinuousMLPBaseline2')

        # Manually change a parameter of ContinuousMLPBaseline2.
        with tf.compat.v1.variable_scope('ContinuousMLPBaseline2',
                                         reuse=True):
            return_var = tf.compat.v1.get_variable(
                'SimpleMLPModel/return_var')
        return_var.load(1.0)

        old_param_values = cmb.get_param_values()
        new_param_values = new_cmb.get_param_values()
        assert not np.array_equal(old_param_values, new_param_values)

        new_cmb.set_param_values(old_param_values)
        new_param_values = new_cmb.get_param_values()
        assert np.array_equal(old_param_values, new_param_values)
def test_ppo_pendulum_continuous_baseline(self):
    """Test PPO with Pendulum environment."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = normalize(
            GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        trainer.setup(algo, env)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 100

        env.close()