def sac_setup(env, trainer, args):
    """Construct SAC (policy, twin Q-functions, replay buffer) and register it with the trainer."""
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[args.hidden_dim] * args.depth,
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[args.hidden_dim] * args.depth,
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[args.hidden_dim] * args.depth,
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(args.buffer_size))
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              # pass the buffer created above to the algorithm
              replay_buffer=replay_buffer,
              **convert_kwargs(args, SAC))
    trainer.setup(algo=sac, env=env, sampler_cls=LocalSampler)
    return sac
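# Hedged usage sketch for sac_setup above. The attribute names hidden_dim,
# depth and buffer_size come from the function body; whatever else
# convert_kwargs (a project helper) forwards to SAC is an assumption here,
# and the values below are illustrative only.
from types import SimpleNamespace

example_args = SimpleNamespace(hidden_dim=256, depth=2, buffer_size=1e6)
# trainer = Trainer(snapshot_config=ctxt)
# env = normalize(GymEnv('HalfCheetah-v2'))
# sac = sac_setup(env=env, trainer=trainer, args=example_args)
# trainer.train(n_epochs=1000, batch_size=1000)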
def sac_half_cheetah_batch(ctxt=None, seed=1):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    deterministic.set_seed(seed)
    trainer = Trainer(snapshot_config=ctxt)
    env = normalize(GymEnv('HalfCheetah-v2'))

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)

    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=1000,
              max_episode_length_eval=1000,
              replay_buffer=replay_buffer,
              min_buffer_size=1e4,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=256,
              reward_scale=1.,
              steps_per_epoch=1)

    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    sac.to()
    trainer.setup(algo=sac, env=env)
    trainer.train(n_epochs=1000, batch_size=1000)
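# Launch sketch for sac_half_cheetah_batch above, assuming (as in the upstream
# garage examples) that the function is decorated with @wrap_experiment so the
# ExperimentContext `ctxt` is created automatically. The seed is illustrative.
#
#     sac_half_cheetah_batch(seed=521)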
def test_fixed_alpha():
    """Test if using fixed_alpha ensures that alpha is non differentiable."""
    env_names = ['InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v2']
    task_envs = [GymEnv(name, max_episode_length=100) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    test_envs = MultiEnvWrapper(task_envs,
                                sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    trainer = Trainer(snapshot_config=snapshot_config)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    num_tasks = 2
    buffer_batch_size = 128
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  sampler=sampler,
                  gradient_steps_per_itr=100,
                  eval_env=[test_envs],
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=1,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size,
                  fixed_alpha=np.exp(0.5))
    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    mtsac.to()
    assert torch.allclose(torch.Tensor([0.5] * num_tasks),
                          mtsac._log_alpha.to('cpu'))
    trainer.setup(mtsac, env)
    trainer.train(n_epochs=1, batch_size=128, plot=False)
    assert torch.allclose(torch.Tensor([0.5] * num_tasks),
                          mtsac._log_alpha.to('cpu'))
    assert not mtsac._use_automatic_entropy_tuning
def test_to():
    """Test the torch function that moves modules to GPU.

    Test that the policy and qfunctions are moved to gpu if gpu is available.

    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [GarageEnv(env_name=name) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    num_tasks = 2
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=env,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    set_gpu_mode(torch.cuda.is_available())
    mtsac.to()
    device = global_device()
    for param in mtsac._qf1.parameters():
        assert param.device == device
    for param in mtsac._qf2.parameters():
        assert param.device == device
    for param in mtsac.policy.parameters():
        assert param.device == device
    assert mtsac._log_alpha.device == device
def torch_sac_half_cheetah(ctxt=None, seed=1):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(snapshot_config=ctxt)
    env = GarageEnv(normalize(gym.make('HalfCheetah-v2')))

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=1)

    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              gradient_steps_per_itr=1000,
              max_path_length=500,
              use_automatic_entropy_tuning=True,
              replay_buffer=replay_buffer,
              min_buffer_size=1e4,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=256,
              reward_scale=1.,
              steps_per_epoch=1)

    if torch.cuda.is_available():
        tu.set_gpu_mode(True)
    else:
        tu.set_gpu_mode(False)
    sac.to()
    runner.setup(algo=sac, env=env, sampler_cls=LocalSampler)
    runner.train(n_epochs=1000, batch_size=1000)
def test_sac_inverted_double_pendulum():
    """Test SAC performance on InvertedDoublePendulum-v2."""
    # pylint: disable=unexpected-keyword-arg
    env = normalize(GymEnv('InvertedDoublePendulum-v2',
                           max_episode_length=100))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    trainer = Trainer(snapshot_config=snapshot_config)
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=100,
              replay_buffer=replay_buffer,
              min_buffer_size=1e3,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=2)
    trainer.setup(sac, env)
    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    sac.to()
    ret = trainer.train(n_epochs=12, batch_size=200, plot=False)
    # check that automatic entropy tuning is used
    assert sac._use_automatic_entropy_tuning
    # assert that there was a gradient properly connected to alpha
    # this doesn't verify that the path from the temperature objective is
    # correct.
    assert not torch.allclose(torch.Tensor([1.]), sac._log_alpha.to('cpu'))
    # check that the policy is learning beyond a predetermined threshold
    assert ret > 80
def test_sac_to():
    """Test moving Sac between CPU and GPU."""
    env = normalize(GymEnv('InvertedDoublePendulum-v2',
                           max_episode_length=100))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    trainer = Trainer(snapshot_config=snapshot_config)
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=100,
              replay_buffer=replay_buffer,
              min_buffer_size=1e3,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=2)
    trainer.setup(sac, env)
    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    sac.to()
    trainer.setup(algo=sac, env=env)
    trainer.train(n_epochs=1, batch_size=100)
    log_alpha = torch.clone(sac._log_alpha).cpu()
    set_gpu_mode(False)
    sac.to()
    assert torch.allclose(log_alpha, sac._log_alpha)
def test_mtsac_get_log_alpha_incorrect_num_tasks(monkeypatch):
    """Check that MTSAC raises an exception when num_tasks is wrong.

    If the num_tasks passed does not match the number of tasks in the
    environment, the algorithm should raise a ValueError. MTSAC uses
    disentangled alphas, meaning that a separate entropy coefficient
    (alpha) is learned per task.
    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [GymEnv(name, max_episode_length=150) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  sampler=None,
                  gradient_steps_per_itr=150,
                  eval_env=[env],
                  env_spec=env.spec,
                  num_tasks=4,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    monkeypatch.setattr(mtsac, '_log_alpha', torch.Tensor([1., 2.]))
    error_string = ('The number of tasks in the environment does '
                    'not match self._num_tasks. Are you sure that you passed '
                    'The correct number of tasks?')
    obs = torch.Tensor([env.reset()[0]] * buffer_batch_size)
    with pytest.raises(ValueError, match=error_string):
        mtsac._get_log_alpha(dict(observation=obs))
def test_mtsac_get_log_alpha(monkeypatch):
    """Check that the private function _get_log_alpha functions correctly.

    MTSAC uses disentangled alphas, meaning that a separate entropy
    coefficient (alpha) is learned per task.
    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [GarageEnv(env_name=name) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    num_tasks = 2
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=env,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    monkeypatch.setattr(mtsac, '_log_alpha', torch.Tensor([1., 2.]))
    for i, _ in enumerate(env_names):
        obs = torch.Tensor([env.reset()] * buffer_batch_size)
        log_alpha = mtsac._get_log_alpha(dict(observation=obs))
        assert (log_alpha == torch.Tensor([i + 1, i + 1])).all().item()
        assert log_alpha.size() == torch.Size([mtsac._buffer_batch_size])
def test_mtsac_inverted_double_pendulum():
    """Performance regression test of MTSAC on 2 InvDoublePendulum envs."""
    env_names = ['InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v2']
    task_envs = [GymEnv(name, max_episode_length=100) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    test_envs = MultiEnvWrapper(task_envs,
                                sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    trainer = Trainer(snapshot_config=snapshot_config)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    num_tasks = 2
    buffer_batch_size = 128
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  sampler=sampler,
                  gradient_steps_per_itr=100,
                  eval_env=[test_envs],
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    trainer.setup(mtsac, env)
    ret = trainer.train(n_epochs=8, batch_size=128, plot=False)
    assert ret > 0
def create_qf_net(env_spec, net_params):
    net_type = net_params["net_type"]
    assert net_type in {"MLP", "Dendrite_MLP"}
    if net_type == "MLP":
        net = ContinuousMLPQFunction(
            env_spec=env_spec,
            hidden_sizes=net_params["qf_hidden_sizes"],
            hidden_nonlinearity=create_nonlinearity(
                net_params["qf_hidden_nonlinearity"]),
            output_nonlinearity=create_nonlinearity(
                net_params["qf_output_nonlinearity"]),
        )
    elif net_type == "Dendrite_MLP":
        dendritic_layer_class = create_dendritic_layer(
            net_params["dendritic_layer_class"])
        net = ContinuousDendriteMLPQFunction(
            env_spec=env_spec,
            hidden_sizes=net_params["hidden_sizes"],
            num_segments=net_params["num_segments"],
            dim_context=net_params["dim_context"],
            kw=net_params["kw"],
            kw_percent_on=net_params["kw_percent_on"],
            context_percent_on=net_params["context_percent_on"],
            weight_sparsity=net_params["weight_sparsity"],
            weight_init=net_params["weight_init"],
            dendrite_init=net_params["dendrite_init"],
            dendritic_layer_class=dendritic_layer_class,
            output_nonlinearity=net_params["output_nonlinearity"],
            preprocess_module_type=net_params["preprocess_module_type"],
            preprocess_output_dim=net_params["preprocess_output_dim"],
            representation_module_type=net_params[
                "representation_module_type"],
            representation_module_dims=net_params[
                "representation_module_dims"],
        )
    else:
        raise NotImplementedError
    return net
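# Hedged configuration sketch for the "MLP" branch of the dict-based
# create_qf_net above. The keys mirror the lookups in that function; the
# values and the "relu" spelling expected by create_nonlinearity are
# illustrative assumptions.
example_mlp_qf_params = {
    "net_type": "MLP",
    "qf_hidden_sizes": (256, 256),
    "qf_hidden_nonlinearity": "relu",
    "qf_output_nonlinearity": None,
}
# qf = create_qf_net(env_spec=env.spec, net_params=example_mlp_qf_params)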
def test_pickling(self):
    """Test pickle and unpickle."""
    deterministic.set_seed(0)
    n_epochs = 10
    steps_per_epoch = 20
    sampler_batch_size = 100
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
    env = normalize(
        GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=None)
    exploration_policy = AddGaussianNoise(env.spec,
                                          policy,
                                          total_timesteps=num_timesteps,
                                          max_sigma=0.1,
                                          min_sigma=0.1)
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    sampler = LocalSampler(agents=exploration_policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    td3 = TD3(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              replay_buffer=replay_buffer,
              sampler=sampler,
              exploration_policy=exploration_policy,
              steps_per_epoch=steps_per_epoch,
              grad_steps_per_env_step=1,
              num_evaluation_episodes=1,
              discount=0.99)
    prefer_gpu()
    td3.to()
    pickled = pickle.dumps(td3)
    unpickled = pickle.loads(pickled)
    assert unpickled
def test_sac_inverted_pendulum():
    """Test Sac performance on inverted pendulum."""
    # pylint: disable=unexpected-keyword-arg
    env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=1)
    runner = LocalRunner(snapshot_config=snapshot_config)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              gradient_steps_per_itr=100,
              max_path_length=100,
              use_automatic_entropy_tuning=True,
              replay_buffer=replay_buffer,
              min_buffer_size=1e3,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=2)
    runner.setup(sac, env, sampler_cls=LocalSampler)
    if torch.cuda.is_available():
        tu.set_gpu_mode(True)
    else:
        tu.set_gpu_mode(False)
    sac.to()
    ret = runner.train(n_epochs=12, batch_size=200, plot=False)
    assert ret > 85
def test_fixed_alpha():
    """Test if using fixed_alpha ensures that alpha is non differentiable."""
    # pylint: disable=unexpected-keyword-arg
    env = normalize(GymEnv('InvertedDoublePendulum-v2',
                           max_episode_length=100))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    trainer = Trainer(snapshot_config=snapshot_config)
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=100,
              replay_buffer=replay_buffer,
              min_buffer_size=100,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=1,
              fixed_alpha=np.exp(0.5))
    trainer.setup(sac, env)
    sac.to()
    trainer.train(n_epochs=1, batch_size=100, plot=False)
    assert torch.allclose(torch.Tensor([0.5]), sac._log_alpha.cpu())
    assert not sac._use_automatic_entropy_tuning
def load_pearl(env_name="CartPole-v0"): """Return an instance of the PEARL algorithm. NOTE: currently not working. """ num_train_tasks = 100 num_test_tasks = 30 latent_size = 5 net_size = 300 encoder_hidden_size = 200 encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size, encoder_hidden_size) # Create multi-task environment and sample tasks. env_start = GarageEnv(env_name=env_name) env_sampler = SetTaskSampler(lambda: GarageEnv(normalize(env_start))) env = env_sampler.sample(num_train_tasks) test_env_sampler = SetTaskSampler(lambda: GarageEnv(normalize(env_start))) # Instantiate networks. augmented_env = PEARL.augment_env_spec(env[0](), latent_size) qf = ContinuousMLPQFunction(env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf') vf = ContinuousMLPQFunction(env_spec=vf_env, hidden_sizes=[net_size, net_size, net_size]) inner_policy = TanhGaussianMLPPolicy( env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) pearl = PEARL(env=env, inner_policy=inner_policy, qf=qf, vf=vf, num_train_tasks=num_train_tasks, num_test_tasks=num_test_tasks, latent_dim=latent_size, encoder_hidden_sizes=encoder_hidden_sizes, test_env_sampler=test_env_sampler) return pearl
def test_pickling(self):
    """Test pickle and unpickle."""
    net_size = 10
    env_sampler = SetTaskSampler(PointEnv)
    env = env_sampler.sample(5)

    test_env_sampler = SetTaskSampler(PointEnv)

    augmented_env = PEARL.augment_env_spec(env[0](), 5)
    qf = ContinuousMLPQFunction(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), 5, 'vf')
    vf = ContinuousMLPQFunction(
        env_spec=vf_env,
        hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(env=env,
                  inner_policy=inner_policy,
                  qf=qf,
                  vf=vf,
                  num_train_tasks=5,
                  num_test_tasks=5,
                  latent_dim=5,
                  encoder_hidden_sizes=[10, 10],
                  test_env_sampler=test_env_sampler)

    # This line is just to improve coverage.
    pearl.to()

    pickled = pickle.dumps(pearl)
    unpickled = pickle.loads(pickled)

    assert hasattr(unpickled, '_replay_buffers')
    assert hasattr(unpickled, '_context_replay_buffers')
    assert unpickled._is_resuming
def test_td3_inverted_double_pendulum(self):
    """Test TD3 on InvertedDoublePendulum-v2."""
    deterministic.set_seed(0)
    n_epochs = 10
    steps_per_epoch = 20
    sampler_batch_size = 100
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
    trainer = Trainer(snapshot_config=snapshot_config)
    env = normalize(
        GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=None)
    exploration_policy = AddGaussianNoise(env.spec,
                                          policy,
                                          total_timesteps=num_timesteps,
                                          max_sigma=0.1,
                                          min_sigma=0.1)
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    td3 = TD3(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              replay_buffer=replay_buffer,
              exploration_policy=exploration_policy,
              steps_per_epoch=steps_per_epoch,
              grad_steps_per_env_step=1,
              num_evaluation_episodes=1,
              discount=0.99)
    prefer_gpu()
    td3.to()
    trainer.setup(td3, env, sampler_cls=LocalSampler)
    trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
def ddpg_pendulum(ctxt=None, seed=1, lr=1e-4):
    """Train DDPG with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        lr (float): Learning rate for policy optimization.

    """
    set_seed(seed)
    trainer = Trainer(ctxt)
    env = normalize(GymEnv('InvertedDoublePendulum-v2'))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec, policy, sigma=0.2)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    policy_optimizer = (torch.optim.Adagrad, {'lr': lr, 'lr_decay': 0.99})

    sampler = LocalSampler(agents=exploration_policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)

    ddpg = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                sampler=sampler,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_policy=exploration_policy,
                target_update_tau=1e-2,
                discount=0.9,
                policy_optimizer=policy_optimizer,
                qf_optimizer=torch.optim.Adam)

    trainer.setup(algo=ddpg, env=env)
    trainer.train(n_epochs=500, batch_size=100)
def load_sac(env_name="MountainCarContinuous-v0"): """Return an instance of the SAC algorithm.""" env = GarageEnv(env_name=env_name) policy = DeterministicMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=[64, 64]) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) algo = SAC(env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, gradient_steps_per_itr=1000, max_path_length=500, replay_buffer=replay_buffer) return algo
def test_output_shape(self, batch_size, hidden_sizes):
    """Test that the Q-function output has shape (batch_size, 1)."""
    env_spec = GymEnv(DummyBoxEnv()).spec
    obs_dim = env_spec.observation_space.flat_dim
    act_dim = env_spec.action_space.flat_dim
    obs = torch.ones(batch_size, obs_dim, dtype=torch.float32)
    act = torch.ones(batch_size, act_dim, dtype=torch.float32)
    qf = ContinuousMLPQFunction(env_spec=env_spec,
                                hidden_nonlinearity=None,
                                hidden_sizes=hidden_sizes,
                                hidden_w_init=nn.init.ones_,
                                output_w_init=nn.init.ones_)
    output = qf(obs, act)

    assert output.shape == (batch_size, 1)
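# Note (assumption): test_output_shape and the other ContinuousMLPQFunction
# tests here take batch_size / hidden_sizes as arguments, so they are
# presumably driven by pytest parametrization along these lines; the
# parameter values shown are illustrative only.
#
#     @pytest.mark.parametrize('batch_size, hidden_sizes',
#                              [(1, (1, )), (5, (3, )), (8, (4, 4))])
#     def test_output_shape(self, batch_size, hidden_sizes):
#         ...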
def ddpg_pendulum(ctxt=None, seed=1, lr=1e-4):
    """Train DDPG with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        lr (float): Learning rate for policy optimization.

    """
    set_seed(seed)
    runner = LocalRunner(ctxt)
    env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    policy_optimizer = (torch.optim.Adagrad, {'lr': lr, 'lr_decay': 0.99})

    ddpg = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                target_update_tau=1e-2,
                discount=0.9,
                policy_optimizer=policy_optimizer,
                qf_optimizer=torch.optim.Adam)

    runner.setup(algo=ddpg, env=env)
    runner.train(n_epochs=500, batch_size=100)
def test_ddpg_pendulum(self):
    """Test DDPG with Pendulum environment.

    This environment has a [-3, 3] action_space bound.
    """
    deterministic.set_seed(0)
    trainer = Trainer(snapshot_config)
    env = normalize(GymEnv('InvertedPendulum-v2'))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec, policy, sigma=0.2)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    sampler = LocalSampler(agents=exploration_policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)

    algo = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                sampler=sampler,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_policy=exploration_policy,
                target_update_tau=1e-2,
                discount=0.9)

    trainer.setup(algo, env)
    last_avg_ret = trainer.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 10

    env.close()
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    runner = LocalRunner(snapshot_config)
    env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    policy_optimizer = (torch.optim.Adagrad, {'lr': 1e-4, 'lr_decay': 0.99})

    ddpg = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                target_update_tau=1e-2,
                discount=0.9,
                policy_optimizer=policy_optimizer,
                qf_optimizer=torch.optim.Adam)

    runner.setup(algo=ddpg, env=env)
    runner.train(n_epochs=500, batch_size=100)
def load_ddpg(env_name="MountainCarContinuous-v0"): """Return an instance of the DDPG algorithm. Note: does this only work with continous? """ env = GarageEnv(env_name=env_name) policy = DeterministicMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=[64, 64]) qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) algo = DDPG(env_spec=env.spec, policy=policy, qf=qf, replay_buffer=replay_buffer) return algo
def test_forward(self, hidden_sizes):
    """Test the forward pass against a hand-computed expected value."""
    env_spec = GymEnv(DummyBoxEnv()).spec
    obs_dim = env_spec.observation_space.flat_dim
    act_dim = env_spec.action_space.flat_dim
    obs = torch.ones(obs_dim, dtype=torch.float32).unsqueeze(0)
    act = torch.ones(act_dim, dtype=torch.float32).unsqueeze(0)
    qf = ContinuousMLPQFunction(env_spec=env_spec,
                                hidden_nonlinearity=None,
                                hidden_sizes=hidden_sizes,
                                hidden_w_init=nn.init.ones_,
                                output_w_init=nn.init.ones_)
    output = qf(obs, act)
    # With all-ones weights, no nonlinearity, and all-ones inputs, every unit
    # outputs the sum of its inputs, so the single Q-value equals
    # (obs_dim + act_dim) * prod(hidden_sizes).
    expected_output = torch.full([1, 1],
                                 fill_value=(obs_dim + act_dim) *
                                 np.prod(hidden_sizes),
                                 dtype=torch.float32)
    assert torch.eq(output, expected_output)
def test_ddpg_pendulum(self):
    """Test DDPG with Pendulum environment.

    This environment has a [-3, 3] action_space bound.
    """
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config)
    env = GarageEnv(normalize(gym.make('InvertedPendulum-v2')))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec, policy, sigma=0.2)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    algo = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_policy=exploration_policy,
                target_update_tau=1e-2,
                discount=0.9)

    runner.setup(algo, env)
    last_avg_ret = runner.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 10

    env.close()
def test_is_pickleable(self, hidden_sizes):
    """Test that a pickled and restored Q-function gives the same output."""
    env_spec = GymEnv(DummyBoxEnv()).spec
    obs_dim = env_spec.observation_space.flat_dim
    act_dim = env_spec.action_space.flat_dim
    obs = torch.ones(obs_dim, dtype=torch.float32).unsqueeze(0)
    act = torch.ones(act_dim, dtype=torch.float32).unsqueeze(0)
    qf = ContinuousMLPQFunction(env_spec=env_spec,
                                hidden_nonlinearity=None,
                                hidden_sizes=hidden_sizes,
                                hidden_w_init=nn.init.ones_,
                                output_w_init=nn.init.ones_)
    output1 = qf(obs, act)

    p = pickle.dumps(qf)
    qf_pickled = pickle.loads(p)
    output2 = qf_pickled(obs, act)

    assert torch.eq(output1, output2)
def test_ddpg_double_pendulum(self):
    """Test DDPG with Pendulum environment."""
    runner = LocalRunner()
    env = GarageEnv(gym.make('InvertedDoublePendulum-v2'))
    action_noise = OUStrategy(env.spec, sigma=0.2)
    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)
    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)
    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)
    algo = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                target_update_tau=1e-2,
                policy_lr=1e-4,
                qf_lr=1e-3,
                discount=0.9)
    runner.setup(algo, env)
    last_avg_ret = runner.train(n_epochs=10,
                                n_epoch_cycles=20,
                                batch_size=100)
    assert last_avg_ret > 60
    env.close()
def create_qf_net(env_spec, net_params):
    if net_params.net_type == "MLP":
        net = ContinuousMLPQFunction(
            env_spec=env_spec,
            hidden_sizes=net_params.hidden_sizes,
            hidden_nonlinearity=create_nonlinearity(
                net_params.qf_hidden_nonlinearity),
            output_nonlinearity=create_nonlinearity(
                net_params.output_nonlinearity),
        )
    elif net_params.net_type == "Dendrite_MLP":
        dendritic_layer_class = create_dendritic_layer(
            net_params.dendritic_layer_class)
        net = ContinuousDendriteMLPQFunction(
            env_spec=env_spec,
            num_tasks=net_params.num_tasks,
            input_data=net_params.input_data,
            context_data=net_params.context_data,
            hidden_sizes=net_params.hidden_sizes,
            layers_modulated=net_params.layers_modulated,
            num_segments=net_params.num_segments,
            kw_percent_on=net_params.kw_percent_on,
            context_percent_on=net_params.context_percent_on,
            weight_sparsity=net_params.weight_sparsity,
            weight_init=net_params.weight_init,
            dendrite_weight_sparsity=net_params.dendrite_weight_sparsity,
            dendrite_init=net_params.dendrite_init,
            dendritic_layer_class=dendritic_layer_class,
            output_nonlinearity=net_params.output_nonlinearity,
            preprocess_module_type=net_params.preprocess_module_type,
            preprocess_output_dim=net_params.preprocess_output_dim,
        )
    else:
        raise NotImplementedError
    return net
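# Hedged sketch: unlike the dict-based variant earlier, this create_qf_net
# reads net_params by attribute access, so a namespace/dataclass-style config
# is expected. The field values below are illustrative assumptions for the
# "MLP" branch only.
from types import SimpleNamespace

example_attr_qf_params = SimpleNamespace(
    net_type="MLP",
    hidden_sizes=(256, 256),
    qf_hidden_nonlinearity="relu",
    output_nonlinearity=None,
)
# qf = create_qf_net(env_spec=env.spec, net_params=example_attr_qf_params)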
def test_ddpg_double_pendulum(self):
    """Test DDPG with Pendulum environment."""
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config)
    env = GarageEnv(gym.make('InvertedDoublePendulum-v2'))
    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)
    exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec, policy, sigma=0.2)
    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    algo = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                max_path_length=100,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_policy=exploration_policy,
                target_update_tau=1e-2,
                discount=0.9)
    runner.setup(algo, env)
    last_avg_ret = runner.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 45
    env.close()