def test_trpo_lstm_cartpole(self):
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('CartPole-v1')))

        policy = CategoricalLSTMPolicy(name='policy', env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01,
                    optimizer_args=dict(
                        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

        snapshotter.snapshot_dir = './'
        runner.setup(algo, env)

        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80

        env.close()
def test_gaussian_policies(self, policy_cls):
    with LocalTFRunner(sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('Pendulum-v0')))

        policy = policy_cls(name='policy', env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=4000)
        env.close()
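# A minimal sketch of how the `policy_cls` argument above might be supplied,
# assuming pytest parametrization over garage's Gaussian policy variants;
# the exact class list is an assumption, not taken from the source.
import pytest

from garage.tf.policies import (GaussianGRUPolicy, GaussianLSTMPolicy,
                                GaussianMLPPolicy)


@pytest.mark.parametrize('policy_cls', [
    GaussianMLPPolicy,
    GaussianGRUPolicy,
    GaussianLSTMPolicy,
])
def test_gaussian_policies_parametrized(policy_cls):
    """Stub illustrating the parametrization pattern; the real body is above."""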
def test_trpo_gru_cartpole(self):
    deterministic.set_seed(2)
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = GarageEnv(normalize(gym.make('CartPole-v1')))

        policy = CategoricalGRUPolicy(name='policy', env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01,
                    optimizer_args=dict(
                        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

        runner.setup(algo, env, sampler_cls=LocalSampler)

        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 64

        env.close()
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalLSTMPolicyWithModel(name='policy',
                                                env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)
def test_categorical_policies(self, policy_cls):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = normalize(GymEnv('CartPole-v0', max_episode_length=100))

        policy = policy_cls(name='policy', env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            discount=0.99,
            max_kl_step=0.01,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(
                hvp_approach=FiniteDifferenceHVP(base_eps=1e-5)),
        )

        trainer.setup(algo, env, sampler_cls=LocalSampler)
        trainer.train(n_epochs=1, batch_size=4000)
        env.close()
def trpo_cartpole_bullet(ctxt=None, seed=1):
    """Train TRPO with PyBullet's CartPoleBulletEnv environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = BulletEnv(
            gym.make('CartPoleBulletEnv-v1',
                     renders=False,
                     discrete_actions=True))

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    max_kl_step=0.01)

        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=4000)
def trpo_swimmer_ray_sampler(ctxt=None, seed=1):
    """Train TRPO on Swimmer-v2 using Ray-based sampling.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    # Since this is an example, we are running ray in a reduced state.
    # One can comment this call out in order to run ray at full capacity.
    ray.init(memory=52428800,
             object_store_memory=78643200,
             ignore_reinit_error=True,
             log_to_driver=False,
             include_dashboard=False)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        set_seed(seed)
        env = GymEnv('Swimmer-v2')

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=0.99,
                    max_kl_step=0.01)

        trainer.setup(algo,
                      env,
                      sampler_cls=RaySampler,
                      sampler_args={'seed': seed})
        trainer.train(n_epochs=40, batch_size=4000)
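# Usage sketch: launchers like the one above are typically decorated with
# garage's @wrap_experiment so that `ctxt` is constructed automatically;
# calling the wrapped function then starts training, and Ray can be shut
# down afterwards to release its resources. The decorator and the shutdown
# call are assumptions here, not shown in the source.
from garage import wrap_experiment

trpo_swimmer_ray_sampler = wrap_experiment(trpo_swimmer_ray_sampler)
trpo_swimmer_ray_sampler(seed=1)
ray.shutdown()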
def test_snapshot(self):
    with LocalRunner() as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=self.verifyItrs, batch_size=4000)

        env.close()

    # Read snapshots from self.log_dir and test the presence and
    # integrity of the policy and the env.
    for i in range(0, self.verifyItrs):
        self.reset_tf()
        with LocalRunner():
            snapshot = joblib.load(
                osp.join(self.log_dir.name, 'itr_{}.pkl'.format(i)))

            env = snapshot['env']
            algo = snapshot['algo']
            assert env
            assert algo
            assert algo.policy

            rollout(env, algo.policy, animated=False)
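# An alternative sketch for loading snapshots: newer garage releases ship a
# Snapshotter with a load() helper, which could replace the raw joblib call
# above. This is version-dependent, so treat it as an assumption; log_dir is
# a hypothetical path.
from garage.experiment import Snapshotter

snapshotter = Snapshotter()
log_dir = './data/local/experiment'  # hypothetical snapshot directory
data = snapshotter.load(log_dir, itr=0)
policy = data['algo'].policy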
def trpo_garage_tf(ctxt, env_id, seed):
    """Create a garage TensorFlow TRPO model and train it.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(normalize(gym.make(env_id)))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=hyper_parameters['max_path_length'],
                    discount=hyper_parameters['discount'],
                    gae_lambda=hyper_parameters['gae_lambda'],
                    max_kl_step=hyper_parameters['max_kl'])

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])
def run_task(snapshot_config, *_):
    with LocalRunner(snapshot_config=snapshot_config,
                     max_cpus=n_envs) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=max_path_length,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo=algo,
                     env=env,
                     sampler_cls=BatchSampler,
                     sampler_args={'n_envs': n_envs})

        runner.train(n_epochs=100, batch_size=4000, plot=False)
def multi_env_trpo(ctxt=None, seed=1):
    """Train TRPO on two different PointEnv instances.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env1 = normalize(PointEnv(goal=(-1., 0.), max_episode_length=100))
        env2 = normalize(PointEnv(goal=(1., 0.), max_episode_length=100))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        trainer.setup(algo, env)
        trainer.train(n_epochs=40, batch_size=2048, plot=False)
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.experiment import LocalRunner
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import GaussianMLPPolicy
from sawyer.mujoco.reacher_env import SimpleReacherEnv

with LocalRunner() as runner:
    env = TfEnv(normalize(SimpleReacherEnv(goal_position=[0.3, 0.3, 0.3])))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # The runner-based TRPO constructor takes an EnvSpec, not the env itself.
    algo = TRPO(
        env_spec=env.spec,
        policy=policy,
        baseline=baseline,
        max_path_length=100,
        discount=0.99,
        max_kl_step=0.01,
    )

    runner.setup(algo, env)
    runner.train(n_epochs=100, batch_size=4000, plot=True)
import gym

from garage.baselines import LinearFeatureBaseline
from garage.contrib.alexbeloi.is_sampler import ISSampler
from garage.envs import normalize
from garage.tf.algos import TRPO
from garage.tf.policies import GaussianMLPPolicy

env = normalize(gym.make('InvertedPendulum-v2'))

policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

optimizer_args = dict(
    # debug_nan=True,
    # reg_coeff=0.1,
    # cg_iters=2
)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=200,
            discount=0.99,
            step_size=0.01,
            sampler_cls=ISSampler,
            sampler_args=dict(n_backtrack=1),
            optimizer_args=optimizer_args)

algo.train()
from garage.baselines import LinearFeatureBaseline
from garage.tf.algos import TRPO
import garage.tf.core.layers as L
from garage.tf.envs import TfEnv
from garage.tf.optimizers import ConjugateGradientOptimizer
from garage.tf.optimizers import FiniteDifferenceHvp
from garage.tf.policies import CategoricalLSTMPolicy

env = TfEnv(env_name="CartPole-v1")

policy = CategoricalLSTMPolicy(
    name="policy",
    env_spec=env.spec,
    lstm_layer_cls=L.TfBasicLSTMLayer,
    # gru_layer_cls=L.GRULayer,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=100,
    discount=0.99,
    max_kl_step=0.01,
    optimizer=ConjugateGradientOptimizer,
    optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

algo.train()
def test_is_sampler(self):
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('InvertedPendulum-v2')))
        policy = GaussianMLPPolicy(env_spec=env.spec,
                                   hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo,
                     env,
                     sampler_cls=ISSampler,
                     sampler_args=dict(n_backtrack=1, init_is=1))
        runner._start_worker()

        paths = runner._sampler.obtain_samples(1)
        assert paths == [], 'Should return empty paths if no history'

        # Test that importance sampling and live sampling are called
        # alternately.
        with unittest.mock.patch.object(ISSampler,
                                        '_obtain_is_samples') as mocked:
            assert runner._sampler.obtain_samples(2, 20)
            mocked.assert_not_called()

            assert runner._sampler.obtain_samples(3)
            mocked.assert_called_once_with(3, None, True)

        # Test importance sampling for the first n_is_pretrain iterations.
        with unittest.mock.patch.object(ISSampler,
                                        '_obtain_is_samples') as mocked:
            runner._sampler.n_is_pretrain = 5
            runner._sampler.n_backtrack = None
            runner._sampler.obtain_samples(4)
            mocked.assert_called_once_with(4, None, True)

        runner._sampler.obtain_samples(5)

        # Test randomly drawing importance samples.
        runner._sampler.randomize_draw = True
        assert runner._sampler.obtain_samples(6, 1)
        runner._sampler.randomize_draw = False
        runner._sampler.obtain_samples(7, 30)

        # Test ess_threshold use.
        runner._sampler.ess_threshold = 500
        paths = runner._sampler.obtain_samples(8, 30)
        assert paths == [], (
            'Should return empty paths when ess_threshold is large')
        runner._sampler.ess_threshold = 0

        # Test random sample selection when len(paths) > batch size.
        runner._sampler.n_is_pretrain = 15
        runner._sampler.obtain_samples(9, 10)
        runner._sampler.obtain_samples(10, 1)

        runner._shutdown_worker()
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.mujoco import SwimmerEnv
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import GaussianMLPPolicy

env = TfEnv(normalize(SwimmerEnv()))

policy = GaussianMLPPolicy(name="policy",
                           env_spec=env.spec,
                           hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=5000,
            max_path_length=500,
            n_itr=200,
            discount=0.99,
            step_size=0.01,
            plot=False)

algo.train()
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import GaussianMLPPolicy
from sandbox.embed2learn.envs.mujoco import PR2ArmEnv

env = TfEnv(normalize(PR2ArmEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_sizes=(32, 32),
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=100,
    discount=0.99,
    step_size=0.01,
    plot=True,
)

algo.train()
def run_task(snapshot_config,
             *_,
             env_params,
             algo='TRPO',
             algo_params={},
             epochs=1000,
             batch_size=4000,
             policy_hidden_sizes=(32, 32),
             embed_state=False,
             model_dir='../models/reacher_limited/train_6',
             augment_embedded_state=False):
    """Run task."""
    embed_config_file = os.path.join(model_dir, 'config.yaml')
    ckpt_path = os.path.join(model_dir, 'model_latest.ckpt')

    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        if embed_state:
            reacher_env = ReacherEmbeddedEnv(
                embed_config_file,
                ckpt_path,
                augment_embedded_state=augment_embedded_state,
                **env_params)
        else:
            reacher_env = ReacherEnv(**env_params)
        env = GarageEnv(reacher_env)

        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=policy_hidden_sizes)

        if algo == 'TRPO':
            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        **algo_params)
        elif algo == 'PPO':
            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(
                    hidden_sizes=(32, 32),
                    use_trust_region=True,
                ),
            )

            # NOTE: make sure when setting entropy_method to 'max', set
            # center_adv to False and turn off policy gradient. See
            # tf.algos.NPO for detailed documentation.
            algo = PPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                gae_lambda=0.95,
                lr_clip_range=0.2,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                ),
                stop_entropy_gradient=True,
                entropy_method='max',
                policy_ent_coeff=0.02,
                center_adv=False,
                **algo_params)
        else:
            raise NotImplementedError(
                'Requested unrecognized algorithm: {}'.format(algo))

        runner.setup(algo, env)
        runner.train(n_epochs=epochs, batch_size=batch_size)
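# Usage sketch: old-style garage entry points like run_task are launched
# through run_experiment. Since run_task requires env_params, a partial can
# bind it first; the values below are hypothetical placeholders, not taken
# from the source.
from functools import partial

from garage.experiment import run_experiment

run_experiment(
    partial(run_task, env_params=dict()),  # hypothetical env_params
    snapshot_mode='last',
    seed=1,
)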
import gym

from garage.baselines import LinearFeatureBaseline
from garage.experiment import run_experiment
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import CategoricalMLPPolicy

# Need to wrap in a tf environment and force_reset to true
# see https://github.com/openai/rllab/issues/87#issuecomment-282519288
env = TfEnv(gym.make("CartPole-v0"))

policy = CategoricalMLPPolicy(name="policy",
                              env_spec=env.spec,
                              hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=200,
    n_itr=120,
    discount=0.99,
    max_kl_step=0.01,
)

run_experiment(algo.train(), n_parallel=1, snapshot_mode="last", seed=1)
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.mujoco import SwimmerEnv
from garage.misc.instrument import run_experiment
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import GaussianMLPPolicy

# Need to wrap in a tf environment and force_reset to true
# see https://github.com/openai/garage/issues/87#issuecomment-282519288
env = TfEnv(normalize(SwimmerEnv()))  # SwimmerEnv must be instantiated

policy = GaussianMLPPolicy(name="policy",
                           env_spec=env.spec,
                           hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=100,
            max_path_length=500,
            n_itr=100,
            discount=0.995,
            step_size=0.001,
            plot=False)

run_experiment(algo.train(), n_parallel=8, snapshot_mode="last", seed=1)