def dqn_cartpole(ctxt=None, seed=1): """Train TRPO with CubeCrash-v0 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) with LocalTFRunner(ctxt) as runner: n_epochs = 10 steps_per_epoch = 10 sampler_batch_size = 500 num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size env = GarageEnv(gym.make('CartPole-v0')) replay_buffer = PathBuffer(capacity_in_transitions=int(1e4)) qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64)) policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf) exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec, policy=policy, total_timesteps=num_timesteps, max_epsilon=1.0, min_epsilon=0.02, decay_ratio=0.1) algo = DQN(env_spec=env.spec, policy=policy, qf=qf, exploration_policy=exploration_policy, max_episode_length=100, replay_buffer=replay_buffer, steps_per_epoch=steps_per_epoch, qf_lr=1e-4, discount=1.0, min_buffer_size=int(1e3), double_q=True, n_train_steps=500, target_network_update_freq=1, buffer_batch_size=32) runner.setup(algo, env) runner.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
def test_fixed_alpha(): """Test if using fixed_alpha ensures that alpha is non differentiable.""" # pylint: disable=unexpected-keyword-arg env = normalize(GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)) deterministic.set_seed(0) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) runner = LocalRunner(snapshot_config=snapshot_config) sac = SAC(env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, gradient_steps_per_itr=100, replay_buffer=replay_buffer, min_buffer_size=100, target_update_tau=5e-3, discount=0.99, buffer_batch_size=64, reward_scale=1., steps_per_epoch=1, fixed_alpha=np.exp(0.5)) runner.setup(sac, env, sampler_cls=LocalSampler) sac.to() runner.train(n_epochs=1, batch_size=100, plot=False) assert torch.allclose(torch.Tensor([0.5]), sac._log_alpha.cpu()) assert not sac._use_automatic_entropy_tuning
def test_pickling(self): """Test pickle and unpickle.""" deterministic.set_seed(0) n_epochs = 10 steps_per_epoch = 20 sampler_batch_size = 100 num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size env = normalize( GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)) policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=F.relu, output_nonlinearity=None) exploration_policy = AddGaussianNoise(env.spec, policy, total_timesteps=num_timesteps, max_sigma=0.1, min_sigma=0.1) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[256, 256], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[256, 256], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) td3 = TD3(env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, replay_buffer=replay_buffer, exploration_policy=exploration_policy, steps_per_epoch=steps_per_epoch, grad_steps_per_env_step=1, num_evaluation_episodes=1, discount=0.99) prefer_gpu() td3.to() pickled = pickle.dumps(td3) unpickled = pickle.loads(pickled) assert unpickled
def trpo_cubecrash(ctxt=None, seed=1, batch_size=4000): """Train TRPO with CubeCrash-v0 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. batch_size (int): Number of timesteps to use in each training step. """ set_seed(seed) with LocalTFRunner(ctxt) as runner: env = TfEnv(normalize(gym.make('CubeCrash-v0'))) policy = CategoricalCNNPolicy(env_spec=env.spec, num_filters=(32, 64), filter_dims=(8, 4), strides=(4, 2), padding='VALID', hidden_sizes=(32, 32)) baseline = GaussianCNNBaseline(env_spec=env.spec, regressor_args=dict( num_filters=(32, 64), filter_dims=(8, 4), strides=(4, 2), padding='VALID', hidden_sizes=(32, 32), use_trust_region=True)) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, policy_ent_coeff=0.0, flatten_input=False) runner.setup(algo, env) runner.train(n_epochs=100, batch_size=batch_size)
def maml_trpo(ctxt, seed, epochs, rollouts_per_task, meta_batch_size): """Set up environment and algorithm and run the task. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. epochs (int): Number of training epochs. rollouts_per_task (int): Number of rollouts per epoch per task for training. meta_batch_size (int): Number of tasks sampled per batch. """ set_seed(seed) env = GarageEnv(normalize(HalfCheetahDirEnv(), expected_action_scale=10.)) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) baseline = LinearFeatureBaseline(env_spec=env.spec) max_path_length = 100 runner = LocalRunner(ctxt) algo = MAMLTRPO(env=env, policy=policy, baseline=baseline, max_path_length=max_path_length, meta_batch_size=meta_batch_size, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1) runner.setup(algo, env) runner.train(n_epochs=epochs, batch_size=rollouts_per_task * max_path_length)
def test_ddpg_pendulum(self): """Test DDPG with Pendulum environment. This environment has a [-3, 3] action_space bound. """ deterministic.set_seed(0) runner = LocalRunner(snapshot_config) env = GarageEnv(normalize(gym.make('InvertedPendulum-v2'))) policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=F.relu, output_nonlinearity=torch.tanh) exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec, policy, sigma=0.2) qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=F.relu) replay_buffer = SimpleReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=100) algo = DDPG(env_spec=env.spec, policy=policy, qf=qf, replay_buffer=replay_buffer, steps_per_epoch=20, n_train_steps=50, min_buffer_size=int(1e4), exploration_policy=exploration_policy, target_update_tau=1e-2, discount=0.9) runner.setup(algo, env) last_avg_ret = runner.train(n_epochs=10, batch_size=100) assert last_avg_ret > 10 env.close()
def categorical_lstm_policy(ctxt, env_id, seed): """Create Categorical LSTM Policy on TF-PPO. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) with LocalTFRunner(ctxt) as runner: env = normalize(GymEnv(env_id)) policy = CategoricalLSTMPolicy( env_spec=env.spec, hidden_dim=32, hidden_nonlinearity=tf.nn.tanh, ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = PPO( env_spec=env.spec, policy=policy, baseline=baseline, max_episode_length=100, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, policy_ent_coeff=0.0, optimizer_args=dict( batch_size=32, max_episode_length=10, learning_rate=1e-3, ), ) runner.setup(algo, env, sampler_args=dict(n_envs=12)) runner.train(n_epochs=488, batch_size=2048)
def tf_gym_music(ctxt=None, seed=1): """Train Policy Gradient LSTM with Music-v0 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. created by @wrap_experiment seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) with TFTrainer(snapshot_config=ctxt) as trainer: env = GymEnv(MusicEnv(monitor = HeartMonitor('DC:39:39:66:26:1F')),max_episode_length = 35) policy = GaussianLSTMPolicy(name='policy', env_spec=env.spec, hidden_dim= 32) baseline = GaussianMLPBaseline( env_spec = env.spec, hidden_sizes=(32, 32), ) sampler = LocalSampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=False, n_workers = 1, ) algo = NPO(env_spec = env.spec, policy = policy, baseline = baseline, sampler = sampler, ) trainer.setup(algo, env) trainer.train(n_epochs=120, batch_size=1,store_episodes = True)
def test_ddpg_double_pendulum(self): """Test DDPG with Pendulum environment.""" deterministic.set_seed(0) trainer = Trainer(snapshot_config) env = GymEnv('InvertedDoublePendulum-v2', max_episode_length=100) policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=F.relu, output_nonlinearity=torch.tanh) exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec, policy, sigma=0.2) qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) sampler = LocalSampler(agents=exploration_policy, envs=env, max_episode_length=env.spec.max_episode_length, worker_class=FragmentWorker) algo = DDPG(env_spec=env.spec, policy=policy, qf=qf, replay_buffer=replay_buffer, sampler=sampler, steps_per_epoch=20, n_train_steps=50, min_buffer_size=int(1e4), exploration_policy=exploration_policy, target_update_tau=1e-2, discount=0.9) trainer.setup(algo, env) last_avg_ret = trainer.train(n_epochs=10, batch_size=100) assert last_avg_ret > 45 env.close()
def vpg_garage_tf(ctxt, env_id, seed): """Create garage TensorFlow VPG model and training. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) with TFTrainer(ctxt) as trainer: env = normalize(GymEnv(env_id)) policy = TF_GMP( env_spec=env.spec, hidden_sizes=hyper_parameters['hidden_sizes'], hidden_nonlinearity=tf.nn.tanh, output_nonlinearity=None, ) baseline = LinearFeatureBaseline(env_spec=env.spec) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = TF_VPG(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=hyper_parameters['discount'], center_adv=hyper_parameters['center_adv'], optimizer_args=dict( learning_rate=hyper_parameters['learning_rate'], )) trainer.setup(algo, env) trainer.train(n_epochs=hyper_parameters['n_epochs'], batch_size=hyper_parameters['batch_size'])
def __init__(self, algo, env, seed, should_render=False, num_processors=None, sampler_worker_cls=None): self._sampler_worker = ray.remote(SamplerWorker if sampler_worker_cls is None else sampler_worker_cls) self._env = env self._algo = algo self._seed = seed deterministic.set_seed(self._seed) self._max_path_length = self._algo.max_path_length self._should_render = should_render if not ray.is_initialized(): ray.init(log_to_driver=False) self._num_workers = (num_processors if num_processors else psutil.cpu_count(logical=False)) self._all_workers = defaultdict(None)
def trpo_garage_tf(ctxt, env_id, seed): """Create garage Tensorflow TROI model and training. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) with TFTrainer(ctxt) as trainer: env = normalize(GymEnv(env_id)) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=hyper_parameters['hidden_sizes'], hidden_nonlinearity=tf.nn.tanh, output_nonlinearity=None, ) baseline = LinearFeatureBaseline(env_spec=env.spec) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=hyper_parameters['discount'], gae_lambda=hyper_parameters['gae_lambda'], max_kl_step=hyper_parameters['max_kl']) trainer.setup(algo, env) trainer.train(n_epochs=hyper_parameters['n_epochs'], batch_size=hyper_parameters['batch_size'])
def trpo_pendulum_ray_sampler(ctxt=None, seed=1): """Set up environment and algorithm and run the task. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ # Since this is an example, we are running ray in a reduced state. # One can comment this line out in order to run ray at full capacity ray.init(memory=52428800, object_store_memory=78643200, ignore_reinit_error=True, log_to_driver=False, include_webui=False) deterministic.set_seed(seed) env = GarageEnv(env_name='InvertedDoublePendulum-v2') runner = LocalRunner(ctxt) policy = GaussianMLPPolicy(env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=torch.tanh, output_nonlinearity=None) value_function = GaussianMLPValueFunction(env_spec=env.spec, hidden_sizes=(32, 32), hidden_nonlinearity=torch.tanh, output_nonlinearity=None) algo = TRPO(env_spec=env.spec, policy=policy, value_function=value_function, max_path_length=100, discount=0.99, center_adv=False) runner.setup(algo, env, sampler_cls=RaySampler) runner.train(n_epochs=100, batch_size=1024)
def test_dqn_cartpole_grad_clip(self): """Test DQN with CartPole environment.""" deterministic.set_seed(100) with LocalTFRunner(snapshot_config, sess=self.sess) as runner: n_epochs = 10 steps_per_epoch = 10 sampler_batch_size = 500 num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size env = GarageEnv(gym.make('CartPole-v0')) replay_buffer = PathBuffer(capacity_in_transitions=int(1e4)) qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64)) policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf) epilson_greedy_policy = EpsilonGreedyPolicy( env_spec=env.spec, policy=policy, total_timesteps=num_timesteps, max_epsilon=1.0, min_epsilon=0.02, decay_ratio=0.1) algo = DQN(env_spec=env.spec, policy=policy, qf=qf, exploration_policy=epilson_greedy_policy, replay_buffer=replay_buffer, max_path_length=100, qf_lr=1e-4, discount=1.0, min_buffer_size=int(1e3), double_q=False, n_train_steps=500, grad_norm_clipping=5.0, steps_per_epoch=steps_per_epoch, target_network_update_freq=1, buffer_batch_size=32) runner.setup(algo, env) last_avg_ret = runner.train(n_epochs=n_epochs, batch_size=sampler_batch_size) assert last_avg_ret > 9 env.close()
def trpo_garage_pytorch(ctxt, env_id, seed): """Create garage PyTorch TRPO model and training. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) trainer = Trainer(ctxt) env = normalize(GymEnv(env_id)) policy = PyTorch_GMP(env.spec, hidden_sizes=hyper_parameters['hidden_sizes'], hidden_nonlinearity=torch.tanh, output_nonlinearity=None) value_function = GaussianMLPValueFunction(env_spec=env.spec, hidden_sizes=(32, 32), hidden_nonlinearity=torch.tanh, output_nonlinearity=None) sampler = LocalSampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length) algo = PyTorch_TRPO(env_spec=env.spec, policy=policy, value_function=value_function, sampler=sampler, discount=hyper_parameters['discount'], gae_lambda=hyper_parameters['gae_lambda']) trainer.setup(algo, env) trainer.train(n_epochs=hyper_parameters['n_epochs'], batch_size=hyper_parameters['batch_size'])
def ppo_memorize_digits(ctxt=None, seed=1, batch_size=4000): """Train PPO on MemorizeDigits-v0 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. batch_size (int): Number of timesteps to use in each training step. """ set_seed(seed) with LocalTFRunner(ctxt) as runner: env = TfEnv(normalize(gym.make('MemorizeDigits-v0')), is_image=True) policy = CategoricalCNNPolicy(env_spec=env.spec, conv_filters=(32, 64, 64), conv_filter_sizes=(5, 3, 2), conv_strides=(4, 2, 1), conv_pad='VALID', hidden_sizes=(256, )) baseline = GaussianCNNBaseline(env_spec=env.spec, regressor_args=dict( num_filters=(32, 64, 64), filter_dims=(5, 3, 2), strides=(4, 2, 1), padding='VALID', hidden_sizes=(256, ), use_trust_region=True)) algo = PPO(env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, max_kl_step=0.01, flatten_input=False) runner.setup(algo, env) runner.train(n_epochs=1000, batch_size=batch_size)
def trpo_swimmer_ray_sampler(ctxt=None, seed=1): """tf_trpo_swimmer. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ # Since this is an example, we are running ray in a reduced state. # One can comment this line out in order to run ray at full capacity ray.init(_memory=52428800, object_store_memory=78643200, ignore_reinit_error=True, log_to_driver=False, include_dashboard=False) with TFTrainer(snapshot_config=ctxt) as trainer: set_seed(seed) env = GymEnv('Swimmer-v2') policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99, max_kl_step=0.01) trainer.setup(algo, env) trainer.train(n_epochs=40, batch_size=4000)
def test_td3_inverted_double_pendulum(self): deterministic.set_seed(0) n_epochs = 10 steps_per_epoch = 20 sampler_batch_size = 100 num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size trainer = Trainer(snapshot_config=snapshot_config) env = normalize( GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)) policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=F.relu, output_nonlinearity=None) exploration_policy = AddGaussianNoise(env.spec, policy, total_timesteps=num_timesteps, max_sigma=0.1, min_sigma=0.1) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[256, 256], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[256, 256], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) td3 = TD3(env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, replay_buffer=replay_buffer, exploration_policy=exploration_policy, steps_per_epoch=steps_per_epoch, grad_steps_per_env_step=1, num_evaluation_episodes=1, discount=0.99) prefer_gpu() td3.to() trainer.setup(td3, env, sampler_cls=LocalSampler) trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
def ppo_garage_pytorch(ctxt, env_id, seed): """Create garage PyTorch PPO model and training. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) runner = LocalRunner(ctxt) env = TfEnv(normalize(gym.make(env_id))) policy = PyTorch_GMP(env.spec, hidden_sizes=(32, 32), hidden_nonlinearity=torch.tanh, output_nonlinearity=None) value_functions = LinearFeatureBaseline(env_spec=env.spec) algo = PyTorch_PPO(env_spec=env.spec, policy=policy, value_function=value_functions, optimizer=torch.optim.Adam, policy_lr=3e-4, max_path_length=hyper_parameters['max_path_length'], discount=0.99, gae_lambda=0.95, center_adv=True, lr_clip_range=0.2, minibatch_size=128, max_optimization_epochs=10) runner.setup(algo, env) runner.train(n_epochs=hyper_parameters['n_epochs'], batch_size=hyper_parameters['batch_size'])
def test_state_dict_to(): """Test state_dict_to""" set_seed(42) # Using tensor instead of Tensor so it can be declared on GPU # pylint: disable=not-callable expected = collections.OrderedDict([ ('_module._layers.0.linear.weight', tensor([[ 0.13957974, -0.2693157, -0.19351028, 0.09471931, -0.43573233, 0.03590716, -0.4272097, -0.13935488, -0.35843086, -0.25814268, 0.03060348 ], [ 0.20623916, -0.1914061, 0.46729338, -0.5437773, -0.50449526, -0.55039907, 0.0141218, -0.02489783, 0.26499796, -0.03836302, 0.7235093 ]], device='cuda:0')), ('_module._layers.0.linear.bias', tensor([0., 0.], device='cuda:0')), ('_module._layers.1.linear.weight', tensor([[-0.7181905, -0.6284401], [0.10591025, -0.14771031]], device='cuda:0')), ('_module._layers.1.linear.bias', tensor([0., 0.], device='cuda:0')), ('_module._output_layers.0.linear.weight', tensor([[-0.29133463, 0.58353233]], device='cuda:0')), ('_module._output_layers.0.linear.bias', tensor([0.], device='cuda:0')) ]) # pylint: enable=not-callable env = normalize(GymEnv('InvertedDoublePendulum-v2')) policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=[2, 2], hidden_nonlinearity=F.relu, output_nonlinearity=torch.tanh) moved_state_dict = state_dict_to(policy.state_dict(), 'cuda') assert np.all([ torch.allclose(expected[key], moved_state_dict[key]) for key in expected.keys() ]) assert np.all( [moved_state_dict[key].is_cuda for key in moved_state_dict.keys()])
def trpo_cartpole_batch_sampler(ctxt=None, seed=1, batch_size=4000, max_path_length=100): """Train TRPO with CartPole-v1 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. batch_size (int): Number of timesteps to use in each training step. max_path_length (int): Number of timesteps to truncate paths to. """ set_seed(seed) n_envs = batch_size // max_path_length with LocalTFRunner(ctxt, max_cpus=n_envs) as runner: env = GarageEnv(env_name='CartPole-v1') policy = CategoricalMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=max_path_length, discount=0.99, max_kl_step=0.01) runner.setup(algo=algo, env=env, sampler_cls=BatchSampler, sampler_args={'n_envs': n_envs}) runner.train(n_epochs=100, batch_size=4000, plot=False)
def trpo_cubecrash(ctxt=None, seed=1, max_episode_length=5, batch_size=4000): """Train TRPO with CubeCrash-v0 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. max_episode_length (int): Maximum length of a single episode. batch_size (int): Number of timesteps to use in each training step. """ set_seed(seed) with LocalTFRunner(ctxt) as runner: env = normalize( GymEnv('CubeCrash-v0', max_episode_length=max_episode_length)) policy = CategoricalCNNPolicy(env_spec=env.spec, filters=((32, (8, 8)), (64, (4, 4))), strides=(4, 2), padding='VALID', hidden_sizes=(32, 32)) baseline = GaussianCNNBaseline(env_spec=env.spec, filters=((32, (8, 8)), (64, (4, 4))), strides=(4, 2), padding='VALID', hidden_sizes=(32, 32), use_trust_region=True) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, policy_ent_coeff=0.0) runner.setup(algo, env) runner.train(n_epochs=100, batch_size=batch_size)
def test_trpo_gru_cartpole(self): deterministic.set_seed(2) with TFTrainer(snapshot_config, sess=self.sess) as trainer: env = normalize(GymEnv('CartPole-v1', max_episode_length=100)) policy = CategoricalGRUPolicy(name='policy', env_spec=env.spec) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, discount=0.99, max_kl_step=0.01, optimizer_args=dict(hvp_approach=FiniteDifferenceHVP( base_eps=1e-5))) trainer.setup(algo, env, sampler_cls=LocalSampler) last_avg_ret = trainer.train(n_epochs=10, batch_size=2048) assert last_avg_ret > 40 env.close()
def setup(): set_seed(24) n_epochs = 11 steps_per_epoch = 10 sampler_batch_size = 512 num_timesteps = 100 * steps_per_epoch * sampler_batch_size env = GymEnv('CartPole-v0') replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5)) policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf) exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec, policy=policy, total_timesteps=num_timesteps, max_epsilon=1.0, min_epsilon=0.01, decay_ratio=0.4) sampler = LocalSampler(agents=exploration_policy, envs=env, max_episode_length=env.spec.max_episode_length, worker_class=FragmentWorker) algo = DQN(env_spec=env.spec, policy=policy, qf=qf, exploration_policy=exploration_policy, replay_buffer=replay_buffer, sampler=sampler, steps_per_epoch=steps_per_epoch, qf_lr=5e-5, double_q=False, discount=0.9, min_buffer_size=int(1e4), n_train_steps=500, target_update_freq=30, buffer_batch_size=64) return algo, env, replay_buffer, n_epochs, sampler_batch_size
def test_meta_evaluator_n_traj(): set_seed(100) tasks = SetTaskSampler(PointEnv) max_path_length = 200 env = GarageEnv(PointEnv()) n_traj = 3 with tempfile.TemporaryDirectory() as log_dir_name: runner = LocalRunner( SnapshotConfig(snapshot_dir=log_dir_name, snapshot_mode='last', snapshot_gap=1)) algo = MockAlgo(env, max_path_length, n_traj) runner.setup(algo, env) meta_eval = MetaEvaluator(runner, test_task_sampler=tasks, max_path_length=max_path_length, n_test_tasks=10, n_exploration_traj=n_traj) log_file = tempfile.NamedTemporaryFile() csv_output = CsvOutput(log_file.name) logger.add_output(csv_output) meta_eval.evaluate(algo)
def test_dqn_cartpole_double_q(self): """Test DQN with CartPole environment.""" deterministic.set_seed(100) with TFTrainer(snapshot_config, sess=self.sess) as trainer: n_epochs = 10 steps_per_epoch = 10 sampler_batch_size = 500 num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size env = GymEnv('CartPole-v0') replay_buffer = PathBuffer(capacity_in_transitions=int(1e4)) qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64)) policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf) epilson_greedy_policy = EpsilonGreedyPolicy( env_spec=env.spec, policy=policy, total_timesteps=num_timesteps, max_epsilon=1.0, min_epsilon=0.02, decay_ratio=0.1) algo = DQN(env_spec=env.spec, policy=policy, qf=qf, exploration_policy=epilson_greedy_policy, replay_buffer=replay_buffer, qf_lr=1e-4, discount=1.0, min_buffer_size=int(1e3), double_q=True, n_train_steps=500, steps_per_epoch=steps_per_epoch, target_network_update_freq=1, buffer_batch_size=32) trainer.setup(algo, env) last_avg_ret = trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size) assert last_avg_ret > 8.8 env.close()
def multi_env_ppo(ctxt=None, seed=1): """Train PPO on two Atari environments simultaneously. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) with LocalTFRunner(ctxt) as runner: env1 = TfEnv(normalize(gym.make('Adventure-ram-v4'))) env2 = TfEnv(normalize(gym.make('Alien-ram-v4'))) env = MultiEnvWrapper([env1, env2]) policy = CategoricalMLPPolicy( env_spec=env.spec, hidden_nonlinearity=tf.nn.tanh, ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = PPO(env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, policy_ent_coeff=0.0, optimizer_args=dict( batch_size=32, max_epochs=10, tf_optimizer_args=dict(learning_rate=1e-3), )) runner.setup(algo, env) runner.train(n_epochs=120, batch_size=2048, plot=False)
def run_garage_tf(env, seed, log_dir): """Create garage TensorFlow VPG model and training.""" deterministic.set_seed(seed) with LocalTFRunner(snapshot_config) as runner: env = TfEnv(normalize(env)) policy = TF_GMP( env_spec=env.spec, hidden_sizes=hyper_parameters['hidden_sizes'], hidden_nonlinearity=tf.nn.tanh, output_nonlinearity=None, ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TF_VPG(env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=hyper_parameters['max_path_length'], discount=hyper_parameters['discount'], center_adv=hyper_parameters['center_adv'], optimizer_args=dict( tf_optimizer_args=dict( learning_rate=hyper_parameters['learning_rate']), verbose=True)) # yapf: disable # Set up logger since we are not using run_experiment tabular_log_file = osp.join(log_dir, 'progress.csv') dowel_logger.add_output(dowel.StdOutput()) dowel_logger.add_output(dowel.CsvOutput(tabular_log_file)) dowel_logger.add_output(dowel.TensorBoardOutput(log_dir)) runner.setup(algo, env) runner.train(n_epochs=hyper_parameters['n_epochs'], batch_size=2048) dowel_logger.remove_all() return tabular_log_file
def test_ppo_pendulum(self): """Test PPO with Pendulum environment.""" deterministic.set_seed(0) episodes_per_task = 5 max_episode_length = self.env.spec.max_episode_length task_sampler = SetTaskSampler( HalfCheetahDirEnv, wrapper=lambda env, _: normalize(GymEnv( env, max_episode_length=max_episode_length), expected_action_scale=10.)) meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler, n_test_tasks=1, n_test_episodes=10) sampler = LocalSampler( agents=self.policy, envs=self.env, max_episode_length=self.env.spec.max_episode_length) trainer = Trainer(snapshot_config) algo = MAMLVPG(env=self.env, policy=self.policy, sampler=sampler, task_sampler=self.task_sampler, value_function=self.value_function, meta_batch_size=5, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1, meta_evaluator=meta_evaluator) trainer.setup(algo, self.env) last_avg_ret = trainer.train(n_epochs=10, batch_size=episodes_per_task * max_episode_length) assert last_avg_ret > -5
def test_bc_point_deterministic(ray_local_session_fixture): # NOQA del ray_local_session_fixture assert ray.is_initialized() deterministic.set_seed(100) trainer = Trainer(snapshot_config) goal = np.array([1., 1.]) env = PointEnv(goal=goal, max_episode_length=200) expert = OptimalPolicy(env.spec, goal=goal) policy = DeterministicMLPPolicy(env.spec, hidden_sizes=[8, 8]) batch_size = 600 sampler = LocalSampler(agents=expert, envs=env, max_episode_length=env.spec.max_episode_length) algo = BC(env.spec, policy, batch_size=batch_size, source=expert, sampler=sampler, policy_lr=1e-2, loss='mse') trainer.setup(algo, env) run_bc(trainer, algo, batch_size)