def test_ddpg_double_pendulum(self):
    """Test DDPG with InvertedDoublePendulum environment."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = GymEnv('InvertedDoublePendulum-v2')
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e5))
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            steps_per_epoch=20,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(5e3),
            exploration_policy=exploration_policy,
        )
        trainer.setup(algo, env)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 60
        env.close()

def test_rl2_trpo_pendulum(self):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        algo = RL2TRPO(
            meta_batch_size=self.meta_batch_size,
            task_sampler=self.tasks,
            env_spec=self.env_spec,
            policy=self.policy,
            baseline=self.baseline,
            episodes_per_trial=self.episode_per_task,
            discount=0.99,
            max_kl_step=0.01,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(
                hvp_approach=FiniteDifferenceHVP(base_eps=1e-5)))
        trainer.setup(algo,
                      self.tasks.sample(self.meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=self.meta_batch_size,
                      worker_class=RL2Worker)
        last_avg_ret = trainer.train(n_epochs=1,
                                     batch_size=self.episode_per_task *
                                     self.max_episode_length *
                                     self.meta_batch_size)
        assert last_avg_ret > -40

def test_set_plot(self):
    deterministic.set_seed(1)
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))
        trainer.setup(algo, env)
        trainer.train(n_epochs=1, batch_size=100, plot=True)
        assert isinstance(trainer._plotter, Plotter), (
            'self.plotter in TFTrainer should be set to Plotter.')

def test_te_ppo(self):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        sampler = LocalSampler(
            agents=self.policy,
            envs=self.env,
            max_episode_length=self.env.spec.max_episode_length,
            is_tf_worker=True,
            worker_class=TaskEmbeddingWorker)
        algo = TEPPO(env_spec=self.env.spec,
                     policy=self.policy,
                     baseline=self.baseline,
                     inference=self.inference,
                     sampler=sampler,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=self.policy_ent_coeff,
                     encoder_ent_coeff=self.encoder_ent_coeff,
                     inference_ce_coeff=self.inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)
        trainer.setup(algo, self.env)
        trainer.train(n_epochs=1, batch_size=self.batch_size, plot=False)

def test_make_sampler_ray_sampler(self, ray_session_fixture):
    del ray_session_fixture
    assert ray.is_initialized()
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = RaySampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))
        trainer.setup(algo, env)
        assert isinstance(trainer._sampler, RaySampler)
        trainer.train(n_epochs=1, batch_size=10)

def test_train(self):
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))
        trainer.setup(algo, env)
        trainer.train(n_epochs=1, batch_size=100)

def trpo_cartpole(ctxt=None, seed=1):
    """Train TRPO with CartPole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=0.99,
                    max_kl_step=0.01)
        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=4000)

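# A minimal launch sketch for the experiment function above, assuming
# garage's wrap_experiment decorator, which creates the ExperimentContext
# and passes it in as ctxt. The name launch_trpo_cartpole is hypothetical;
# garage's own examples apply @wrap_experiment directly to the function.
from garage import wrap_experiment

launch_trpo_cartpole = wrap_experiment(trpo_cartpole)
launch_trpo_cartpole(seed=1)
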
def test_gaussian_policies(self, policy_cls):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = normalize(GymEnv('Pendulum-v0'))
        policy = policy_cls(name='policy', env_spec=env.spec)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            max_kl_step=0.01,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(
                hvp_approach=FiniteDifferenceHVP(base_eps=1e-5)),
        )
        trainer.setup(algo, env)
        trainer.train(n_epochs=1, batch_size=4000)
        env.close()

def train_trpo(ctxt=None, seed=1, gym_env=None, n_eps=100):
    # seed, gym_env, and n_eps were undefined free names in the original
    # snippet; they are exposed here as parameters so the function is
    # self-contained.
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = MyGymEnv(gym_env, max_episode_length=100)
        policy = CategoricalMLPPolicy(
            name='policy',
            env_spec=env.spec,
        )
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            worker_class=FragmentWorker,
        )
        algo = LoggedTRPO(
            env=env,
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            max_kl_step=0.01,
        )
        trainer.setup(algo, env)
        trainer.train(n_epochs=n_eps, batch_size=4000)
        return algo.rew_chkpts

def fixture_exp(snapshot_config, sess):
    """Dummy fixture experiment function.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by Trainer to create the snapshotter.
            If None, it will create one with default settings.
        sess (tf.Session): An optional TensorFlow session.
            A new session will be created immediately if not provided.

    Returns:
        np.ndarray: Values of the parameters evaluated in the current
            session.

    """
    with TFTrainer(snapshot_config=snapshot_config, sess=sess) as trainer:
        env = GymEnv('CartPole-v1', max_episode_length=100)
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))
        trainer.setup(algo, env, sampler_cls=LocalSampler)
        trainer.train(n_epochs=5, batch_size=100)
        return policy.get_param_values()

def test_cma_es_cartpole(self):
    """Test CMAES with CartPole-v1 environment."""
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        n_samples = 20
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = CMAES(env_spec=env.spec,
                     policy=policy,
                     sampler=sampler,
                     n_samples=n_samples)
        trainer.setup(algo, env)
        trainer.train(n_epochs=1, batch_size=1000)
        # No assertion on return because CMAES is not stable.
        env.close()

def her_ddpg_fetchreach(ctxt=None, seed=1):
    """Train DDPG + HER on the goal-conditioned FetchReach env.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = GymEnv('FetchReach-v1')
        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)
        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
        )
        # pylint: disable=no-member
        replay_buffer = HERReplayBuffer(capacity_in_transitions=int(1e6),
                                        replay_k=4,
                                        reward_fn=env.compute_reward,
                                        env_spec=env.spec)
        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.01,
            steps_per_epoch=50,
            n_train_steps=40,
            discount=0.95,
            exploration_policy=exploration_policy,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            buffer_batch_size=256,
        )
        trainer.setup(algo=ddpg, env=env)
        trainer.train(n_epochs=50, batch_size=256)

def continuous_mlp_q_function(ctxt, env_id, seed):
    """Create Continuous MLP QFunction on TF-DDPG.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))
        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='ContinuousMLPPolicy',
            hidden_sizes=hyper_params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)
        exploration_policy = AddOrnsteinUhlenbeckNoise(
            env.spec, policy, sigma=hyper_params['sigma'])
        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hyper_params['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            name='ContinuousMLPQFunction')
        replay_buffer = PathBuffer(
            capacity_in_transitions=hyper_params['replay_buffer_size'])
        sampler = LocalSampler(
            agents=exploration_policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True,
            worker_class=FragmentWorker)
        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    sampler=sampler,
                    steps_per_epoch=hyper_params['steps_per_epoch'],
                    policy_lr=hyper_params['policy_lr'],
                    qf_lr=hyper_params['qf_lr'],
                    target_update_tau=hyper_params['tau'],
                    n_train_steps=hyper_params['n_train_steps'],
                    discount=hyper_params['discount'],
                    min_buffer_size=int(1e4),
                    exploration_policy=exploration_policy,
                    policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                    qf_optimizer=tf.compat.v1.train.AdamOptimizer)
        trainer.setup(ddpg, env)
        trainer.train(n_epochs=hyper_params['n_epochs'],
                      batch_size=hyper_params['n_exploration_steps'])

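# The function above reads a module-level hyper_params dict that is not
# shown in this section. A hypothetical sketch of its shape, inferred from
# the keys used above; the values are illustrative placeholders, not the
# benchmark's actual settings.
hyper_params = {
    'policy_hidden_sizes': (64, 64),
    'qf_hidden_sizes': (64, 64),
    'sigma': 0.2,
    'replay_buffer_size': int(1e6),
    'steps_per_epoch': 20,
    'policy_lr': 1e-4,
    'qf_lr': 1e-3,
    'tau': 1e-2,
    'n_train_steps': 50,
    'discount': 0.99,
    'n_epochs': 300,
    'n_exploration_steps': 100,
}
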
def test_td3_pendulum(self):
    """Test TD3 with InvertedDoublePendulum environment."""
    with TFTrainer(snapshot_config) as trainer:
        n_epochs = 10
        steps_per_epoch = 20
        sampler_batch_size = 250
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        exploration_policy = AddGaussianNoise(env.spec,
                                              policy,
                                              total_timesteps=num_timesteps,
                                              max_sigma=0.1,
                                              min_sigma=0.1)
        qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                    env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    action_merge_layer=0,
                                    hidden_nonlinearity=tf.nn.relu)
        qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                     env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     action_merge_layer=0,
                                     hidden_nonlinearity=tf.nn.relu)
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
        algo = TD3(env_spec=env.spec,
                   policy=policy,
                   policy_lr=1e-3,
                   qf_lr=1e-3,
                   qf=qf,
                   qf2=qf2,
                   replay_buffer=replay_buffer,
                   steps_per_epoch=steps_per_epoch,
                   target_update_tau=0.005,
                   n_train_steps=50,
                   discount=0.99,
                   min_buffer_size=int(1e4),
                   buffer_batch_size=100,
                   policy_weight_decay=0.001,
                   qf_weight_decay=0.001,
                   exploration_policy=exploration_policy,
                   policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                   qf_optimizer=tf.compat.v1.train.AdamOptimizer)
        trainer.setup(algo, env, sampler_cls=LocalSampler)
        last_avg_ret = trainer.train(n_epochs=n_epochs,
                                     batch_size=sampler_batch_size)
        assert last_avg_ret > 200

def her_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow HER model and training.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))
        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        exploration_policy = AddOrnsteinUhlenbeckNoise(
            env_spec=env.spec, policy=policy, sigma=hyper_parameters['sigma'])
        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
        )
        replay_buffer = HERReplayBuffer(
            env_spec=env.spec,
            capacity_in_transitions=hyper_parameters['replay_buffer_size'],
            replay_k=4,
            reward_fn=env.compute_reward,
        )
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            steps_per_epoch=hyper_parameters['steps_per_epoch'],
            policy_lr=hyper_parameters['policy_lr'],
            qf_lr=hyper_parameters['qf_lr'],
            target_update_tau=hyper_parameters['tau'],
            n_train_steps=hyper_parameters['n_train_steps'],
            discount=hyper_parameters['discount'],
            exploration_policy=exploration_policy,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            buffer_batch_size=256,
        )
        trainer.setup(algo, env)
        trainer.train(n_epochs=hyper_parameters['n_epochs'],
                      batch_size=hyper_parameters['n_exploration_steps'])

def ppo_memorize_digits(ctxt=None, seed=1, batch_size=4000,
                        max_episode_length=100):
    """Train PPO on MemorizeDigits-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.
        max_episode_length (int): Max number of timesteps in an episode.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = normalize(
            GymEnv('MemorizeDigits-v0',
                   is_image=True,
                   max_episode_length=max_episode_length))
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=(
                                          (32, (5, 5)),
                                          (64, (3, 3)),
                                          (64, (2, 2)),
                                      ),
                                      strides=(4, 2, 1),
                                      padding='VALID',
                                      hidden_sizes=(256, ))  # yapf: disable
        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            filters=(
                (32, (5, 5)),
                (64, (3, 3)),
                (64, (2, 2)),
            ),
            strides=(4, 2, 1),
            padding='VALID',
            hidden_sizes=(256, ),
            use_trust_region=True)  # yapf: disable
        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_optimization_epochs=10,
                       learning_rate=1e-3,
                   ))
        trainer.setup(algo, env)
        trainer.train(n_epochs=1000, batch_size=batch_size)

def gaussian_lstm_policy(ctxt, env_id, seed):
    """Create Gaussian LSTM Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))
        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            use_trust_region=False,
            optimizer=FirstOrderOptimizer,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )
        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )
        trainer.setup(algo, env)
        trainer.train(n_epochs=5, batch_size=2048)

def test_dqn_cartpole_pickle(self):
    """Test DQN with CartPole environment."""
    deterministic.set_seed(100)
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = GymEnv('CartPole-v0')
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e4))
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
        epsilon_greedy_policy = EpsilonGreedyPolicy(
            env_spec=env.spec,
            policy=policy,
            total_timesteps=num_timesteps,
            max_epsilon=1.0,
            min_epsilon=0.02,
            decay_ratio=0.1)
        sampler = LocalSampler(
            agents=epsilon_greedy_policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True,
            worker_class=FragmentWorker)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_policy=epsilon_greedy_policy,
                   replay_buffer=replay_buffer,
                   sampler=sampler,
                   qf_lr=1e-4,
                   discount=1.0,
                   min_buffer_size=int(1e3),
                   double_q=False,
                   n_train_steps=500,
                   grad_norm_clipping=5.0,
                   steps_per_epoch=steps_per_epoch,
                   target_network_update_freq=1,
                   buffer_batch_size=32)
        trainer.setup(algo, env)
        with tf.compat.v1.variable_scope(
                'DiscreteMLPQFunction/mlp/hidden_0', reuse=True):
            bias = tf.compat.v1.get_variable('bias')
            # Assign an all-ones bias so the pickled copy is
            # distinguishable from a freshly initialized network.
            old_bias = tf.ones_like(bias).eval()
            bias.load(old_bias)
        h = pickle.dumps(algo)
        with tf.compat.v1.Session(graph=tf.Graph()):
            pickle.loads(h)
            with tf.compat.v1.variable_scope(
                    'DiscreteMLPQFunction/mlp/hidden_0', reuse=True):
                new_bias = tf.compat.v1.get_variable('bias')
                new_bias = new_bias.eval()
                assert np.array_equal(old_bias, new_bias)
        env.close()

def ppo_pendulum(ctxt=None, seed=1):
    """Train PPO with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = normalize(GymEnv('InvertedDoublePendulum-v2'))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            use_trust_region=True,
        )
        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)
        # NOTE: when setting entropy_method to 'max', set center_adv to
        # False and stop_entropy_gradient to True, as done here. See
        # tf.algos.NPO for detailed documentation.
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        trainer.setup(algo, env)
        trainer.train(n_epochs=120, batch_size=2048, plot=False)

def ppo_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow PPO model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))
        policy = TF_GMP(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = TF_GMB(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            use_trust_region=False,
            optimizer=FirstOrderOptimizer,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=3e-4,
            ),
        )
        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)
        algo = TF_PPO(env_spec=env.spec,
                      policy=policy,
                      baseline=baseline,
                      sampler=sampler,
                      discount=0.99,
                      gae_lambda=0.95,
                      center_adv=True,
                      lr_clip_range=0.2,
                      optimizer_args=dict(batch_size=32,
                                          max_optimization_epochs=10,
                                          learning_rate=3e-4,
                                          verbose=True))
        trainer.setup(algo, env)
        trainer.train(n_epochs=hyper_parameters['n_epochs'],
                      batch_size=hyper_parameters['batch_size'])

def rl2_trpo_halfcheetah(ctxt, seed, max_episode_length, meta_batch_size,
                         n_epochs, episode_per_task):
    """Train TRPO with HalfCheetah environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_episode_length (int): Maximum length of a single episode.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        tasks = task_sampler.SetTaskSampler(
            HalfCheetahVelEnv,
            wrapper=lambda env, _: RL2Env(
                GymEnv(env, max_episode_length=max_episode_length)))
        env_spec = RL2Env(
            GymEnv(HalfCheetahVelEnv(),
                   max_episode_length=max_episode_length)).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)
        baseline = LinearFeatureBaseline(env_spec=env_spec)
        algo = RL2TRPO(meta_batch_size=meta_batch_size,
                       task_sampler=tasks,
                       env_spec=env_spec,
                       policy=policy,
                       baseline=baseline,
                       episodes_per_trial=episode_per_task,
                       discount=0.99,
                       max_kl_step=0.01,
                       optimizer=ConjugateGradientOptimizer,
                       optimizer_args=dict(
                           hvp_approach=FiniteDifferenceHVP(base_eps=1e-5)))
        trainer.setup(algo,
                      tasks.sample(meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=meta_batch_size,
                      worker_class=RL2Worker,
                      worker_args=dict(
                          n_episodes_per_trial=episode_per_task))
        trainer.train(n_epochs=n_epochs,
                      batch_size=episode_per_task * max_episode_length *
                      meta_batch_size)

def gaussian_cnn_baseline(ctxt, env_id, seed):
    """Create Gaussian CNN Baseline on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=params['conv_filters'],
                                      strides=params['conv_strides'],
                                      padding=params['conv_pad'],
                                      hidden_sizes=params['hidden_sizes'])
        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            filters=params['conv_filters'],
            strides=params['conv_strides'],
            padding=params['conv_pad'],
            hidden_sizes=params['hidden_sizes'],
            use_trust_region=params['use_trust_region'])
        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )
        trainer.setup(algo, env)
        trainer.train(n_epochs=params['n_epochs'],
                      batch_size=params['batch_size'])

def ddpg_pendulum(ctxt=None, seed=1):
    """Train DDPG with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = GymEnv('InvertedDoublePendulum-v2')
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
        sampler = LocalSampler(
            agents=exploration_policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True,
            worker_class=FragmentWorker)
        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    policy_lr=1e-4,
                    qf_lr=1e-3,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    sampler=sampler,
                    steps_per_epoch=20,
                    target_update_tau=1e-2,
                    n_train_steps=50,
                    discount=0.9,
                    min_buffer_size=int(1e4),
                    exploration_policy=exploration_policy,
                    policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                    qf_optimizer=tf.compat.v1.train.AdamOptimizer)
        trainer.setup(algo=ddpg, env=env)
        trainer.train(n_epochs=500, batch_size=100)

def resume_experiment(ctxt, saved_dir):
    """Resume a TensorFlow experiment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        saved_dir (str): Path where snapshots are saved.

    """
    with TFTrainer(snapshot_config=ctxt) as trainer:
        trainer.restore(from_dir=saved_dir)
        trainer.resume()

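# A hedged usage sketch: like the other experiment functions here,
# resume_experiment would be launched through garage's wrap_experiment so
# that ctxt is supplied automatically. The snapshot directory below is a
# hypothetical placeholder, not a real path from this repo.
resume = wrap_experiment(resume_experiment)
resume(saved_dir='data/local/experiment/trpo_cartpole')
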
def test_npo_pendulum(self):
    """Test NPO with Pendulum environment."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        algo = NPO(env_spec=self.env.spec,
                   policy=self.policy,
                   baseline=self.baseline,
                   discount=0.99,
                   gae_lambda=0.98,
                   policy_ent_coeff=0.0)
        trainer.setup(algo, self.env, sampler_cls=LocalSampler)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 20

def test_rl2_trpo_pendulum_invalid_kl_constraint(self):
    with TFTrainer(snapshot_config, sess=self.sess):
        with pytest.raises(ValueError):
            RL2TRPO(meta_batch_size=self.meta_batch_size,
                    task_sampler=self.tasks,
                    env_spec=self.env_spec,
                    policy=self.policy,
                    baseline=self.baseline,
                    kl_constraint='xyz',
                    episodes_per_trial=self.episode_per_task,
                    discount=0.99,
                    max_kl_step=0.01)

def ppo_car(ctxt=None, specs=None):
    mem_history = []
    assert specs is not None
    set_seed(1)
    tf.keras.backend.clear_session()
    with TFTrainer(snapshot_config=ctxt) as trainer:
        # env = normalize(GymEnv("LunarLanderContinuous-v2"))
        env = normalize(CarEnv(specs), normalize_obs=True)
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            use_trust_region=True,
        )
        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=500,
                             is_tf_worker=True)
        # NOTE: when setting entropy_method to 'max', set center_adv to
        # False and stop_entropy_gradient to True, as done here. See
        # tf.algos.NPO for detailed documentation.
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.07,
            optimizer_args=dict(
                batch_size=128,
                max_optimization_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        trainer.setup(algo, env)
        trainer.train(n_epochs=300, batch_size=2048, plot=False)
        trainer.save()

def test_trpo_soft_kl_constraint(self):
    """Test TRPO with soft KL constraint."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        algo = TRPO(env_spec=self.env.spec,
                    policy=self.policy,
                    baseline=self.baseline,
                    discount=0.99,
                    gae_lambda=0.98,
                    policy_ent_coeff=0.0,
                    kl_constraint='soft')
        trainer.setup(algo, self.env, sampler_cls=LocalSampler)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 45

def test_rl2_trpo_pendulum_default_optimizer(self):
    with TFTrainer(snapshot_config, sess=self.sess):
        algo = RL2TRPO(meta_batch_size=self.meta_batch_size,
                       task_sampler=self.tasks,
                       env_spec=self.env_spec,
                       policy=self.policy,
                       baseline=self.baseline,
                       kl_constraint='hard',
                       episodes_per_trial=self.episode_per_task,
                       discount=0.99,
                       max_kl_step=0.01)
        assert isinstance(algo._inner_algo._optimizer,
                          ConjugateGradientOptimizer)

def test_ppo_pendulum(self):
    """Test PPO with Pendulum environment."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        algo = PPO(env_spec=self.env.spec,
                   policy=self.policy,
                   baseline=self.baseline,
                   discount=0.99,
                   lr_clip_range=0.01,
                   optimizer_args=dict(batch_size=32,
                                       max_optimization_epochs=10))
        trainer.setup(algo, self.env, sampler_cls=LocalSampler)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 35