def test_clone(self):
    box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
    gmb = GaussianMLPBaseline(env_spec=box_env_spec)
    cloned_gmb_model = gmb.clone_model(name='cloned_model')
    for cloned_param, param in zip(cloned_gmb_model.parameters.values(),
                                   gmb.parameters.values()):
        assert np.array_equal(cloned_param, param)
def test_fit_unnormalized(self):
    box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
    gmb = GaussianMLPBaseline(env_spec=box_env_spec,
                              subsample_factor=0.9,
                              normalize_inputs=False,
                              normalize_outputs=False)
    train_paths, _, _, paths, expected = get_train_test_data()

    for _ in range(150):
        gmb.fit(train_paths)

    prediction = gmb.predict(paths)
    assert np.allclose(prediction, expected, rtol=0, atol=0.1)

    x_mean = self.sess.run(gmb._networks['default'].x_mean)
    x_mean_expected = np.zeros_like(x_mean)
    x_std = self.sess.run(gmb._networks['default'].x_std)
    x_std_expected = np.ones_like(x_std)
    assert np.array_equal(x_mean, x_mean_expected)
    assert np.array_equal(x_std, x_std_expected)

    y_mean = self.sess.run(gmb._networks['default'].y_mean)
    y_mean_expected = np.zeros_like(y_mean)
    y_std = self.sess.run(gmb._networks['default'].y_std)
    y_std_expected = np.ones_like(y_std)
    assert np.allclose(y_mean, y_mean_expected)
    assert np.allclose(y_std, y_std_expected)
def test_fit_normalized(self):
    box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
    gmb = GaussianMLPBaseline(env_spec=box_env_spec)
    (train_paths, observations, returns, paths,
     expected) = get_train_test_data()

    for _ in range(150):
        gmb.fit(train_paths)

    prediction = gmb.predict(paths)
    assert np.allclose(prediction, expected, rtol=0, atol=0.1)

    x_mean = self.sess.run(gmb._networks['default'].x_mean)
    x_mean_expected = np.mean(observations, axis=0, keepdims=True)
    x_std = self.sess.run(gmb._networks['default'].x_std)
    x_std_expected = np.std(observations, axis=0, keepdims=True)
    assert np.allclose(x_mean, x_mean_expected)
    assert np.allclose(x_std, x_std_expected)

    y_mean = self.sess.run(gmb._networks['default'].y_mean)
    y_mean_expected = np.mean(returns, axis=0, keepdims=True)
    y_std = self.sess.run(gmb._networks['default'].y_std)
    y_std_expected = np.std(returns, axis=0, keepdims=True)
    assert np.allclose(y_mean, y_mean_expected)
    assert np.allclose(y_std, y_std_expected)
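# The fit tests above and below rely on a get_train_test_data() helper that
# is not defined in this section. Below is a minimal sketch of what such a
# fixture could look like, assuming 2-D observations and a deterministic
# return signal the baseline can fit within 150 iterations; the shapes and
# target function are assumptions, not the actual garage fixture.
def get_train_test_data():
    obs = np.random.uniform(-1., 1., size=(1000, 2))  # (N, obs_dim)
    returns = np.sum(obs, axis=1)  # easy-to-fit deterministic target
    train_paths = [{'observations': obs, 'returns': returns}]

    test_obs = np.random.uniform(-1., 1., size=(10, 2))
    expected = np.sum(test_obs, axis=1)
    paths = {'observations': [test_obs]}

    observations = np.concatenate([p['observations'] for p in train_paths])
    returns = np.concatenate(
        [p['returns'] for p in train_paths]).reshape((-1, 1))
    return train_paths, observations, returns, paths, expected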
def test_unflattened_input(self):
    env = GymEnv(DummyBoxEnv(obs_dim=(2, 2)))
    gmb = GaussianMLPBaseline(env_spec=env.spec)
    env.reset()
    es = env.step(1)
    obs, rewards = es.observation, es.reward
    train_paths = [{'observations': [obs], 'returns': [rewards]}]
    gmb.fit(train_paths)
    paths = {'observations': [obs]}
    prediction = gmb.predict(paths)
    assert np.allclose(0., prediction)
def test_baseline(self):
    """Test the baseline initialization."""
    box_env = GarageEnv(DummyBoxEnv())
    deterministic_mlp_baseline = ContinuousMLPBaseline(env_spec=box_env)
    gaussian_mlp_baseline = GaussianMLPBaseline(env_spec=box_env)
    self.sess.run(tf.compat.v1.global_variables_initializer())
    deterministic_mlp_baseline.get_param_values()
    gaussian_mlp_baseline.get_param_values()
    box_env.close()
def test_get_params_internal(self, obs_dim):
    box_env = TfEnv(DummyBoxEnv(obs_dim=obs_dim))
    with mock.patch(('garage.tf.baselines.'
                     'gaussian_mlp_baseline.'
                     'GaussianMLPRegressor'),
                    new=SimpleGaussianMLPRegressor):
        gmb = GaussianMLPBaseline(env_spec=box_env.spec,
                                  regressor_args=dict())
    params_internal = gmb.get_params_internal()
    trainable_params = tf.compat.v1.trainable_variables(
        scope='GaussianMLPBaseline')
    assert np.array_equal(params_internal, trainable_params)
def test_fit_smaller_subsample_factor(self):
    box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
    gmb = GaussianMLPBaseline(env_spec=box_env_spec, subsample_factor=0.9)
    train_paths, _, _, paths, expected = get_train_test_data()

    for _ in range(150):
        gmb.fit(train_paths)

    prediction = gmb.predict(paths)
    assert np.allclose(prediction, expected, rtol=0, atol=0.1)
def test_fit_without_trusted_region(self):
    box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
    gmb = GaussianMLPBaseline(env_spec=box_env_spec,
                              use_trust_region=False)
    train_paths, _, _, paths, expected = get_train_test_data()

    for _ in range(150):
        gmb.fit(train_paths)

    prediction = gmb.predict(paths)
    assert np.allclose(prediction, expected, rtol=0, atol=0.1)
def test_ppo_pendulum_with_model(self):
    """Test PPO with model, with Pendulum environment."""
    logger.reset()
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
    policy = GaussianMLPPolicyWithModel(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        lr_clip_range=0.01,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=False,
    )
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 40
    env.close()
def run_task(*_):
    env = TfEnv(env_name="CartPole-v1")

    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )

    algo = InstrumentedTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,
        max_path_length=100,
        n_itr=4,
        discount=0.99,
        gae_lambda=0.98,
        policy_ent_coeff=0.0,
        plot=True,
    )
    algo.train()
    env.close()
def run_task(v):
    v = SimpleNamespace(**v)

    with LocalRunner() as runner:
        # Environment
        env = SimpleReacherEnv(goal_position=GOALS[0],
                               control_method="position_control",
                               completion_bonus=5)
        env = TfEnv(env)

        # Policy
        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=(64, 32),
            init_std=v.policy_init_std,
        )

        baseline = GaussianMLPBaseline(env_spec=env.spec)

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=v.max_path_length,
            discount=0.99,
            lr_clip_range=0.2,
            optimizer_args=dict(batch_size=32, max_epochs=10),
            plot=True,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=v.batch_size, plot=False)
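# The run_task functions in these scripts are not called directly; in garage
# of this era they were handed to the run_experiment launcher. A hedged
# sketch of how this script would typically be kicked off, assuming the
# run_experiment entry point from garage.experiment; the experiment prefix
# and variant values here are illustrative, not from the original script.
from garage.experiment import run_experiment

variant = dict(policy_init_std=1.0, max_path_length=100, batch_size=4096)

run_experiment(
    run_task,
    exp_prefix='ppo_simple_reacher',  # hypothetical experiment name
    variant=variant,
    snapshot_mode='last',
    seed=1,
)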
def trpo_minigrid(ctxt=None, seed=1):
    """Train TRPO with the DisabledAntPyBulletEnv-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(env_name='DisabledAntPyBulletEnv-v0')

        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(128, 64, 32))

        # baseline = LinearFeatureBaseline(env_spec=env.spec)
        baseline = GaussianMLPBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=0.99,
                    max_kl_step=0.001)

        runner.setup(algo, env)
        runner.train(n_epochs=2000, batch_size=4000)
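# trpo_minigrid uses the newer ctxt-style signature, which garage normally
# produces by decorating the function with @wrap_experiment and calling it
# directly. The decorator is not shown in this snippet, so the sketch below
# assumes it; wrap_experiment builds the ExperimentContext and passes it in
# as ctxt.
from garage import wrap_experiment

trpo_minigrid = wrap_experiment(trpo_minigrid)
trpo_minigrid(seed=1)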
def run_task(*_):
    sess = tf.Session()
    sess.__enter__()
    snapshot = joblib.load(latent_policy_pkl)
    latent_policy = snapshot["policy"]

    inner_env = SimpleReacherEnv(goal_position=(0.65, 0.3, 0.3),
                                 control_method="position_control",
                                 completion_bonus=30)
    env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        init_std=20,
        # std_share_network=False,
        # adaptive_std=True
    )

    baseline = GaussianMLPBaseline(env_spec=env.spec,
                                   include_action_to_input=False)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,  # 4096
        max_path_length=100,
        n_itr=1500,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=1e-6,
        plot=True,
    )
    algo.train(sess=sess)
def run_task(*_):
    sess = tf.Session()
    sess.__enter__()
    latent_policy = joblib.load(latent_policy_pkl)["policy"]

    with LocalRunner(sess=sess) as runner:
        inner_env = PointEnv(goal=(1.4, 1.4), completion_bonus=100)
        env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

        policy = GaussianMLPPolicy(name="composer",
                                   env_spec=env.spec,
                                   hidden_sizes=(64, 64),
                                   init_std=20,
                                   std_share_network=False,
                                   adaptive_std=True)

        baseline = GaussianMLPBaseline(env_spec=env.spec)

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1024,  # 4096
            max_path_length=50,
            n_itr=1500,
            discount=0.99,
            step_size=0.2,
            policy_ent_coeff=1e-6,
            plot=True,
            use_mpc_es=True,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=600, plot=False, batch_size=1024)
def run_task(*_):
    with LocalRunner() as runner:
        env = PointEnv(goal=(3, 3), random_start=True)
        env = TfEnv(env)

        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(64, 64),
                                   init_std=20,
                                   std_share_network=False,
                                   adaptive_std=True)

        baseline = GaussianMLPBaseline(env_spec=env.spec,
                                       include_action_to_input=False)

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1024,  # 4096
            max_path_length=50,
            n_itr=1500,
            discount=0.99,
            step_size=0.2,
            policy_ent_coeff=1e-6,
            use_mpc_es=True,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1500, batch_size=1024, plot=True)
def test_ppo_pendulum_lstm(self):
    """Test PPO with Pendulum environment and recurrent policy."""
    with TFTrainer(snapshot_config) as trainer:
        env = normalize(
            GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
        lstm_policy = GaussianLSTMPolicy(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=lstm_policy,
            baseline=baseline,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        trainer.setup(algo, env, sampler_cls=LocalSampler)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 60
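# Several tests here pass a module-level snapshot_config into
# TFTrainer/LocalTFRunner without defining it. A minimal sketch of how such
# a config can be built, assuming garage's SnapshotConfig named tuple and a
# throwaway temporary directory:
import tempfile

from garage.experiment import SnapshotConfig

snapshot_config = SnapshotConfig(snapshot_dir=tempfile.mkdtemp(),
                                 snapshot_mode='last',
                                 snapshot_gap=1)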
def test_ppo_pendulum_recurrent(self):
    """Test PPO with Pendulum environment and recurrent policy."""
    with LocalRunner() as runner:
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianLSTMPolicy(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            lr_clip_range=0.01,
            optimizer_args=dict(batch_size=32, max_epochs=10),
            plot=False,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 40
        env.close()
def test_trpo_pendulum(self):
    """Test TRPO with Pendulum environment."""
    logger.reset()
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        gae_lambda=0.98,
        policy_ent_coeff=0.0,
        plot=False,
    )
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 50
    env.close()
def run_task(*_):
    env = TfEnv(
        normalize(
            GridworldGathererEnv(plot={
                'visitation': {
                    # 'save': '~/garage/data/local/gridworld/instant-run',
                    'save': False,
                    'live': True
                }
            })))

    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=5000,
        max_path_length=100,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
    )

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as session:
        algo.train(sess=session)
def test_process_samples_continuous_recurrent(self):
    env = TfEnv(DummyBoxEnv())
    policy = GaussianLSTMPolicy(env_spec=env.spec)
    baseline = GaussianMLPBaseline(env_spec=env.spec)
    max_path_length = 100
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = BatchPolopt2(env_spec=env.spec,
                            policy=policy,
                            baseline=baseline,
                            max_path_length=max_path_length,
                            flatten_input=True)
        runner.setup(algo, env, sampler_args=dict(n_envs=1))
        runner.train(n_epochs=1, batch_size=max_path_length)
        paths = runner.obtain_samples(0)
        samples = algo.process_samples(0, paths)
        # Since there is only 1 vec_env in the sampler and DummyBoxEnv
        # never terminates before it reaches max_path_length, the batch
        # size must be max_path_length, i.e. 100.
        assert samples['observations'].shape == (
            max_path_length, env.observation_space.flat_dim)
        assert samples['actions'].shape == (max_path_length,
                                            env.action_space.flat_dim)
        assert samples['rewards'].shape == (max_path_length, )
        assert samples['baselines'].shape == (max_path_length, )
        assert samples['returns'].shape == (max_path_length, )
        # There is only 1 path.
        assert samples['lengths'].shape == (1, )
        for key, shape in policy.state_info_specs:
            assert samples['agent_infos'][key].shape == (max_path_length,
                                                         np.prod(shape))
        # DummyBoxEnv has the env_info 'dummy'.
        assert samples['env_infos']['dummy'].shape == (max_path_length, )
        assert isinstance(samples['average_return'], float)
def test_ppo_pendulum_gru_with_model(self):
    """Test PPO with Pendulum environment and GRU policy."""
    with LocalRunner(sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianGRUPolicyWithModel(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80
        env.close()
def test_ppo_pendulum_with_model(self):
    """Test PPO with model, with Pendulum environment."""
    with LocalRunner(self.sess) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            lr_clip_range=0.01,
            optimizer_args=dict(batch_size=32, max_epochs=10),
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 30
        env.close()
def test_ppo_pendulum_gru(self):
    """Test PPO with Pendulum environment and recurrent policy."""
    with LocalTFRunner(snapshot_config) as runner:
        env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        gru_policy = GaussianGRUPolicy(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=gru_policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        runner.setup(algo, env, sampler_cls=LocalSampler)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80
def test_npo_pendulum(self):
    """Test NPO with Pendulum environment."""
    with LocalRunner(self.sess) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = NPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.98,
                   policy_ent_coeff=0.0)
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 20
        env.close()
def run_task(v):
    v = SimpleNamespace(**v)

    # Environment
    env = SimpleReacherEnv(goal_position=GOALS[0],
                           control_method="position_control",
                           completion_bonus=5)
    env = TfEnv(env)

    # Policy
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(64, 32),
        init_std=v.policy_init_std,
    )

    baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=1000,
        discount=0.99,
        step_size=0.2,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=True,
    )
    algo.train()
def test_trpo_unknown_kl_constraint(self):
    """Test TRPO with unknown KL constraints."""
    logger.reset()
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    with self.assertRaises(NotImplementedError) as context:
        TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2048,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            gae_lambda=0.98,
            policy_ent_coeff=0.0,
            plot=False,
            kl_constraint="random kl_constraint",
        )
    assert "Unknown KLConstraint" in str(context.exception)
    env.close()
def test_ppo_pendulum(self):
    """Test PPO with Pendulum environment."""
    logger._tensorboard = TensorBoardOutput()
    env = TfEnv(normalize(gym.make("Pendulum-v0")))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        gae_lambda=0.98,
        policy_ent_coeff=0.0,
        plot=False,
    )
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > -1000
def run_task(*_): """ Wrap PPO training task in the run_task function. :param _: :return: """ env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2"))) policy = GaussianMLPPolicy( name="policy", env_spec=env.spec, hidden_sizes=(64, 64)) baseline = GaussianMLPBaseline(env_spec=env.spec) algo = PPO( env=env, policy=policy, baseline=baseline, batch_size=2048, max_path_length=100, n_itr=488, discount=0.99, step_size=0.01, optimizer_args=dict(batch_size=32, max_epochs=10), plot=False) algo.train()
def gaussian_lstm_policy(ctxt, env_id, seed):
    """Create Gaussian LSTM Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            use_trust_region=False,
            optimizer=FirstOrderOptimizer,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        trainer.setup(algo, env)
        trainer.train(n_epochs=5, batch_size=2048)
def run_garage(env, seed, log_dir):
    """Create garage model and training.

    Replace the TRPO with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path of the tabular log file.
    """
    ext.set_seed(seed)

    with tf.Graph().as_default():
        env = TfEnv(normalize(env))

        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(32, 32),
                use_trust_region=True,
            ),
        )

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1024,
            max_path_length=100,
            n_itr=976,
            discount=0.99,
            gae_lambda=0.98,
            clip_range=0.1,
            policy_ent_coeff=0.0,
            plot=False,
        )

        # Set up logger since we are not using run_experiment.
        tabular_log_file = osp.join(log_dir, "progress.csv")
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(log_dir)

        algo.train()

        garage_logger.remove_tabular_output(tabular_log_file)

        return tabular_log_file
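# run_garage is written to be driven by a benchmark harness. A hedged sketch
# of a direct invocation, where the env id and log directory are
# illustrative assumptions rather than the harness's actual values:
import gym

csv_path = run_garage(gym.make('InvertedDoublePendulum-v2'),
                      seed=1,
                      log_dir='/tmp/garage_trpo_benchmark')
print('progress written to', csv_path)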