def setup_method(self):
    super().setup_method()
    self.env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    self.baseline = GaussianMLPBaseline(
        env_spec=self.env.spec,
        hidden_sizes=(32, 32),
    )
def ppo_pendulum(ctxt=None, seed=1):
    """Train PPO with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            use_trust_region=True,
        )

        # NOTE: make sure when setting entropy_method to 'max', set
        # center_adv to False and turn off policy gradient. See
        # tf.algos.NPO for detailed documentation.
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=120, batch_size=2048, plot=False)
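# A minimal launcher sketch for ppo_pendulum above, assuming garage's
# standard `wrap_experiment` entry point (which constructs the
# ExperimentContext passed in as `ctxt`). The call below is illustrative,
# not part of the original example script.
from garage import wrap_experiment

if __name__ == '__main__':
    ppo_pendulum_experiment = wrap_experiment(ppo_pendulum)
    ppo_pendulum_experiment(seed=1)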
def ppo_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow PPO model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(normalize(gym.make(env_id)))

        policy = TF_GMP(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = TF_GMB(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            use_trust_region=False,
            optimizer=FirstOrderOptimizer,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=3e-4,
            ),
        )

        algo = TF_PPO(env_spec=env.spec,
                      policy=policy,
                      baseline=baseline,
                      max_path_length=hyper_parameters['max_path_length'],
                      discount=0.99,
                      gae_lambda=0.95,
                      center_adv=True,
                      lr_clip_range=0.2,
                      optimizer_args=dict(batch_size=32,
                                          max_epochs=10,
                                          learning_rate=3e-4,
                                          verbose=True))

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])
def ppo_garage_pytorch(ctxt, env_id, seed):
    """Create garage PyTorch PPO model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    runner = LocalRunner(ctxt)

    env = GarageEnv(normalize(gym.make(env_id)))

    policy = PyTorch_GMP(env.spec,
                         hidden_sizes=(32, 32),
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    policy_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                        policy,
                                        max_optimization_epochs=10,
                                        minibatch_size=64)

    vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                    value_function,
                                    max_optimization_epochs=10,
                                    minibatch_size=64)

    algo = PyTorch_PPO(env_spec=env.spec,
                       policy=policy,
                       value_function=value_function,
                       policy_optimizer=policy_optimizer,
                       vf_optimizer=vf_optimizer,
                       max_path_length=hyper_parameters['max_path_length'],
                       discount=0.99,
                       gae_lambda=0.95,
                       center_adv=True,
                       lr_clip_range=0.2)

    runner.setup(algo, env)
    runner.train(n_epochs=hyper_parameters['n_epochs'],
                 batch_size=hyper_parameters['batch_size'])
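# ppo_garage_tf and ppo_garage_pytorch both read a module-level
# `hyper_parameters` dict defined in their benchmark script. A plausible
# sketch of the keys they use; the values below are illustrative
# assumptions, not the original benchmark configuration.
hyper_parameters = {
    'n_epochs': 500,          # passed to runner.train()
    'max_path_length': 100,   # passed to the PPO constructor
    'batch_size': 1024,       # environment steps gathered per epoch
}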
def test_sac_inverted_double_pendulum():
    """Test SAC performance on InvertedDoublePendulum-v2."""
    # pylint: disable=unexpected-keyword-arg
    env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    runner = LocalRunner(snapshot_config=snapshot_config)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              gradient_steps_per_itr=100,
              max_path_length=100,
              replay_buffer=replay_buffer,
              min_buffer_size=1e3,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=2)
    runner.setup(sac, env, sampler_cls=LocalSampler)
    if torch.cuda.is_available():
        tu.set_gpu_mode(True)
    else:
        tu.set_gpu_mode(False)
    sac.to()
    ret = runner.train(n_epochs=12, batch_size=200, plot=False)
    # check that automatic entropy tuning is used
    assert sac._use_automatic_entropy_tuning
    # assert that there was a gradient properly connected to alpha
    # this doesn't verify that the path from the temperature objective is
    # correct.
    assert not torch.allclose(torch.Tensor([1.]), sac._log_alpha.to('cpu'))
    # check that the policy is learning beyond a predefined threshold
    assert ret > 85
def ddpg_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow DDPG model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(normalize(gym.make(env_id)))

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        exploration_policy = AddOrnsteinUhlenbeckNoise(
            env.spec, policy, sigma=hyper_parameters['sigma'])

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu)

        replay_buffer = PathBuffer(
            capacity_in_transitions=hyper_parameters['replay_buffer_size'])

        algo = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    steps_per_epoch=hyper_parameters['steps_per_epoch'],
                    policy_lr=hyper_parameters['policy_lr'],
                    qf_lr=hyper_parameters['qf_lr'],
                    target_update_tau=hyper_parameters['tau'],
                    n_train_steps=hyper_parameters['n_train_steps'],
                    discount=hyper_parameters['discount'],
                    min_buffer_size=int(1e4),
                    exploration_policy=exploration_policy,
                    policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                    qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['n_rollout_steps'])
def test_obs_not_image(self):
    env = GarageEnv(DummyDiscretePixelEnv())
    with mock.patch(('tests.fixtures.models.SimpleCNNModel._build'),
                    autospec=True,
                    side_effect=SimpleCNNModel._build) as build:
        with mock.patch(('garage.tf.models.'
                         'cnn_mlp_merge_model.CNNModel'),
                        new=SimpleCNNModel):
            with mock.patch(('garage.tf.models.'
                             'cnn_mlp_merge_model.MLPMergeModel'),
                            new=SimpleMLPMergeModel):
                qf = ContinuousCNNQFunction(env_spec=env.spec,
                                            filters=((5, (3, 3)), ),
                                            strides=(1, ))

                # ensure non-image observations are not normalized
                # in _initialize() and get_qval()
                normalized_obs = build.call_args_list[0][0][1]
                assert normalized_obs == qf.inputs[0]

                fake_obs = [
                    np.full(env.spec.observation_space.shape, 255.)
                ]
                assert (self.sess.run(
                    normalized_obs,
                    feed_dict={qf.inputs[0]: fake_obs}) == 255.).all()

                # ensure non-image observations are not normalized
                # in get_qval_sym()
                obs_dim = env.spec.observation_space.shape
                state_input = tf.compat.v1.placeholder(tf.float32,
                                                       shape=(None, ) +
                                                       obs_dim)
                act_dim = env.spec.observation_space.shape
                action_input = tf.compat.v1.placeholder(tf.float32,
                                                        shape=(None, ) +
                                                        act_dim)

                qf.get_qval_sym(state_input, action_input, name='another')
                normalized_obs = build.call_args_list[1][0][1]

                assert (self.sess.run(
                    normalized_obs,
                    feed_dict={state_input: fake_obs}) == 255.).all()
def test_output_shape(self, obs_dim, action_dim):
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.q_functions.'
                     'continuous_mlp_q_function.MLPMergeModel'),
                    new=SimpleMLPMergeModel):
        qf = ContinuousMLPQFunction(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)
    obs = obs.flatten()
    act = np.full(action_dim, 0.5).flatten()

    outputs = qf.get_qval([obs], [act])

    assert outputs.shape == (1, 1)
def test_polopt_algo(self, algo_cls, env_cls, policy_cls, baseline_cls):
    logger.log('Testing {}, {}, {}'.format(algo_cls.__name__,
                                           env_cls.__name__,
                                           policy_cls.__name__))
    env = GarageEnv(env_cls())
    policy = policy_cls(env_spec=env.spec)
    baseline = baseline_cls(env_spec=env.spec)
    algo = algo_cls(env=env,
                    policy=policy,
                    baseline=baseline,
                    **(algo_args.get(algo_cls, dict())))
    algo.train()
    assert not np.any(np.isnan(policy.get_param_values()))
    env.close()
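# test_polopt_algo looks up per-algorithm keyword overrides in a
# module-level `algo_args` mapping. A minimal hypothetical sketch of that
# fixture; the TRPO entry and its value are illustrative assumptions, not
# the original test's configuration.
from garage.tf.algos import TRPO

algo_args = {
    TRPO: dict(max_kl_step=0.01),  # e.g. tighten TRPO's KL constraint
}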
def gaussian_cnn_baseline(ctxt, env_id, seed):
    """Create Gaussian CNN Baseline on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt, max_cpus=12) as runner:
        env = GarageEnv(normalize(gym.make(env_id)))

        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=params['conv_filters'],
                                      strides=params['conv_strides'],
                                      padding=params['conv_pad'],
                                      hidden_sizes=params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                filters=params['conv_filters'],
                strides=params['conv_strides'],
                padding=params['conv_pad'],
                hidden_sizes=params['hidden_sizes'],
                use_trust_region=params['use_trust_region']))

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            flatten_input=False,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=1e-3,
            ),
        )

        runner.setup(algo, env)
        runner.train(n_epochs=params['n_epochs'],
                     batch_size=params['batch_size'])
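# gaussian_cnn_baseline reads a module-level `params` dict. A plausible
# sketch of its contents for a pixel task; every value below is an
# illustrative assumption, not the original benchmark configuration.
params = {
    'conv_filters': ((32, (5, 5)), (64, (3, 3))),  # (n_filters, (h, w))
    'conv_strides': (4, 2),
    'conv_pad': 'VALID',
    'hidden_sizes': (256, ),
    'use_trust_region': True,
    'n_epochs': 1000,
    'batch_size': 2048,
}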
def test_get_action(self, obs_dim, action_dim):
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = GaussianMLPPolicy(env_spec=env.spec)

    env.reset()
    obs, _, _, _ = env.step(1)

    action, _ = policy.get_action(obs.flatten())
    assert env.action_space.contains(action)

    actions, _ = policy.get_actions(
        [obs.flatten(), obs.flatten(), obs.flatten()])
    for action in actions:
        assert env.action_space.contains(action)
def test_get_action(self, obs_dim, action_dim):
    env = GarageEnv(
        DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = CategoricalMLPPolicy(env_spec=env.spec)

    obs = env.reset()

    action, _ = policy.get_action(obs.flatten())
    assert env.action_space.contains(action)

    actions, _ = policy.get_actions(
        [obs.flatten(), obs.flatten(), obs.flatten()])
    for action in actions:
        assert env.action_space.contains(action)
def _init_multi_env_wrapper(self,
                            env_names,
                            sample_strategy=uniform_random_strategy):
    """Helper function to initialize a MultiEnvWrapper.

    Args:
        env_names (list(str)): List of gym.Env names.
        sample_strategy (func): A sampling strategy.

    Returns:
        garage.envs.multi_env_wrapper.MultiEnvWrapper: The wrapped envs.

    """
    task_envs = [GarageEnv(env_name=name) for name in env_names]
    return MultiEnvWrapper(task_envs, sample_strategy=sample_strategy)
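# A usage sketch for the helper above: a hypothetical test method (meant
# to live in the same test class) that swaps in round_robin_strategy,
# also exported by garage.envs.multi_env_wrapper, instead of the
# uniform-random default.
from garage.envs.multi_env_wrapper import round_robin_strategy

def test_round_robin_strategy(self):
    mt_env = self._init_multi_env_wrapper(
        ['CartPole-v0', 'CartPole-v1'],
        sample_strategy=round_robin_strategy)
    obs = mt_env.reset()
    assert mt_env.observation_space.contains(obs)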
def test_get_action(self, obs_dim, action_dim, hidden_dim):
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = GaussianGRUPolicy(env_spec=env.spec,
                               hidden_dim=hidden_dim,
                               state_include_action=False)
    policy.reset(do_resets=None)
    obs = env.reset()

    action, _ = policy.get_action(obs.flatten())
    assert env.action_space.contains(action)

    actions, _ = policy.get_actions([obs.flatten()])
    for action in actions:
        assert env.action_space.contains(action)
def test_mtsac_get_log_alpha(monkeypatch):
    """Check that the private function _get_log_alpha functions correctly.

    MTSAC uses disentangled alphas, meaning that each task gets its own
    entropy temperature, so _get_log_alpha should return the log-alpha
    matching each sample's one-hot task id.
    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [GarageEnv(env_name=name) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    num_tasks = 2
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=env,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    monkeypatch.setattr(mtsac, '_log_alpha', torch.Tensor([1., 2.]))

    for i, _ in enumerate(env_names):
        obs = torch.Tensor([env.reset()] * buffer_batch_size)
        log_alpha = mtsac._get_log_alpha(dict(observation=obs))
        assert (log_alpha == torch.Tensor([i + 1, i + 1])).all().item()
        assert log_alpha.size() == torch.Size([mtsac._buffer_batch_size])
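# The assertions above boil down to one-hot indexing: the last `num_tasks`
# entries of each observation are a one-hot task id, and the matching
# per-task log-alpha is gathered for every sample in the batch. A
# standalone sketch of that selection (assumed-equivalent logic, not the
# garage implementation itself):
import torch

def select_log_alpha(log_alpha, obs, num_tasks):
    """Gather each sample's per-task log-alpha from a batch of obs."""
    one_hots = obs[:, -num_tasks:]      # (batch, num_tasks) one-hot ids
    task_ids = one_hots.argmax(dim=-1)  # (batch,) integer task indices
    return log_alpha[task_ids]          # (batch,) selected log-alphas

# select_log_alpha(torch.Tensor([1., 2.]),
#                  torch.Tensor([[0.5, 0., 1.], [0.3, 1., 0.]]),
#                  num_tasks=2)  # -> tensor([2., 1.])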
def continuous_mlp_baseline(ctxt, env_id, seed):
    """Create Continuous MLP Baseline on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(normalize(gym.make(env_id)))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=hyper_params['policy_hidden_sizes'],
            hidden_nonlinearity=hyper_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
        )

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=hyper_params['max_path_length'],
                   discount=hyper_params['discount'],
                   gae_lambda=hyper_params['gae_lambda'],
                   lr_clip_range=hyper_params['lr_clip_range'],
                   entropy_method=hyper_params['entropy_method'],
                   policy_ent_coeff=hyper_params['policy_ent_coeff'],
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       learning_rate=1e-3,
                   ),
                   center_adv=hyper_params['center_adv'],
                   stop_entropy_gradient=True)

        runner.setup(algo,
                     env,
                     sampler_args=dict(n_envs=hyper_params['n_envs']))
        runner.train(n_epochs=hyper_params['n_epochs'],
                     batch_size=hyper_params['n_rollout_steps'])
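# continuous_mlp_baseline reads a module-level `hyper_params` dict. A
# plausible sketch of the keys it uses; all values are illustrative
# assumptions (note that entropy_method='max' pairs with center_adv=False
# and stop_entropy_gradient=True, as in the PPO example earlier).
hyper_params = {
    'policy_hidden_sizes': 32,           # GaussianLSTMPolicy hidden_dim
    'hidden_nonlinearity': tf.nn.tanh,
    'max_path_length': 100,
    'discount': 0.99,
    'gae_lambda': 0.95,
    'lr_clip_range': 0.2,
    'entropy_method': 'max',
    'policy_ent_coeff': 0.02,
    'center_adv': False,
    'n_envs': 8,
    'n_epochs': 100,
    'n_rollout_steps': 2048,
}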
def test_output_shape(self, batch_size, hidden_sizes):
    env_spec = GarageEnv(DummyBoxEnv())
    obs_dim = env_spec.observation_space.flat_dim
    act_dim = env_spec.action_space.flat_dim
    obs = torch.ones(batch_size, obs_dim, dtype=torch.float32)
    act = torch.ones(batch_size, act_dim, dtype=torch.float32)
    qf = ContinuousMLPQFunction(env_spec=env_spec,
                                hidden_nonlinearity=None,
                                hidden_sizes=hidden_sizes,
                                hidden_w_init=nn.init.ones_,
                                output_w_init=nn.init.ones_)
    output = qf(obs, act)

    assert output.shape == (batch_size, 1)
def test_get_action_img_obs(self, hidden_channels, kernel_sizes, strides,
                            hidden_sizes):
    """Test get_action function with akro.Image observation space."""
    env = GarageEnv(DummyDiscretePixelEnv(), is_image=True)
    env = self._initialize_obs_env(env)
    policy = CategoricalCNNPolicy(env=env,
                                  kernel_sizes=kernel_sizes,
                                  hidden_channels=hidden_channels,
                                  strides=strides,
                                  hidden_sizes=hidden_sizes)
    env.reset()
    obs, _, _, _ = env.step(1)

    action, _ = policy.get_action(obs)
    assert env.action_space.contains(action)
def test_build(self, obs_dim, action_dim):
    env = GarageEnv(
        DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    qf = DiscreteMLPQFunction(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)

    output1 = self.sess.run(qf.q_vals, feed_dict={qf.input: [obs]})

    input_var = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, ) + obs_dim)
    q_vals = qf.build(input_var, 'another')
    output2 = self.sess.run(q_vals, feed_dict={input_var: [obs]})

    assert np.array_equal(output1, output2)
def stest_cma_es_cartpole():
    """Test CMA-ES with the CartPole-v1 environment."""
    with LocalTFRunner(snapshot_config) as runner:
        with TensorBoardPytorchWriter(PROJECT_APP_PATH.user_log /
                                      "CMA") as writer:
            env = GarageEnv(env_name="CartPole-v1")

            algo = CovarianceMatrixAdaptationEvolutionStrategyAgent(
                env_spec=env.spec, max_rollout_length=100)
            algo.build()

            runner.setup(algo, env, sampler_cls=LocalSampler)
            runner.train(n_epochs=1, batch_size=1000)
            env.close()
def tutorial_cem(ctxt=None):
    """Train CEM with the CartPole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.

    """
    set_seed(100)
    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(env_name='CartPole-v1')
        policy = CategoricalMLPPolicy(env.spec)
        algo = SimpleCEM(env.spec, policy)
        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=1000)
def tutorial_vpg(ctxt=None):
    """Train VPG with the PointEnv environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.

    """
    set_seed(100)
    runner = LocalRunner(ctxt)
    env = GarageEnv(PointEnv())
    policy = GaussianMLPPolicy(env.spec)
    algo = SimpleVPG(env.spec, policy)
    runner.setup(algo, env)
    runner.train(n_epochs=200, batch_size=4000)
def test_get_embedding(self, obs_dim, embedding_dim):
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=embedding_dim))
    embedding_spec = InOutSpec(input_space=env.spec.observation_space,
                               output_space=env.spec.action_space)
    embedding = GaussianMLPEncoder(embedding_spec)
    task_input = tf.compat.v1.placeholder(tf.float32,
                                          shape=(None, None,
                                                 embedding.input_dim))
    embedding.build(task_input)

    env.reset()
    obs, _, _, _ = env.step(1)

    latent, _ = embedding.forward(obs)
    assert env.action_space.contains(latent)
def test_one_hot_observation_space(self):
    """Test the one-hot representation of the observation space."""
    envs = ['CartPole-v0', 'CartPole-v1']
    mt_env = self._init_multi_env_wrapper(envs)
    cartpole = GarageEnv(env_name='CartPole-v0')
    cartpole_lb, cartpole_ub = cartpole.observation_space.bounds
    obs_space = akro.Box(np.concatenate([cartpole_lb, np.zeros(2)]),
                         np.concatenate([cartpole_ub, np.ones(2)]))
    assert mt_env.observation_space.shape == obs_space.shape
    assert (
        mt_env.observation_space.bounds[0] == obs_space.bounds[0]).all()
    assert (
        mt_env.observation_space.bounds[1] == obs_space.bounds[1]).all()
def test_build(self, obs_dim, action_dim):
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = GaussianMLPPolicy(env_spec=env.spec)
    obs = env.reset()

    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None,
                                                  policy.input_dim))
    dist_sym = policy.build(state_input, name='dist_sym').dist
    dist_sym2 = policy.build(state_input, name='dist_sym2').dist
    output1 = self.sess.run([dist_sym.loc],
                            feed_dict={state_input: [[obs.flatten()]]})
    output2 = self.sess.run([dist_sym2.loc],
                            feed_dict={state_input: [[obs.flatten()]]})
    assert np.array_equal(output1, output2)
def test_no_seed():
    max_path_length = 16
    env = GarageEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    n_workers = 8
    workers = WorkerFactory(seed=None,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policy, env)
    rollouts = sampler.obtain_samples(0, 160, policy)
    assert sum(rollouts.lengths) >= 160
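# For contrast with test_no_seed: a sketch of the seeded setup, where
# giving WorkerFactory an integer seed makes sampling reproducible. Same
# garage API as above; the helper name and seed value are assumptions.
def make_seeded_sampler(env, policy, seed=42, max_path_length=16):
    """Build a LocalSampler whose workers are deterministically seeded."""
    workers = WorkerFactory(seed=seed,
                            max_path_length=max_path_length,
                            n_workers=8)
    return LocalSampler.from_worker_factory(workers, policy, env)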
def test_meta_evaluator():
    set_seed(100)
    tasks = SetTaskSampler(lambda: GarageEnv(PointEnv()))
    max_episode_length = 200
    with tempfile.TemporaryDirectory() as log_dir_name:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=log_dir_name,
                           snapshot_mode='last',
                           snapshot_gap=1))
        env = GarageEnv(PointEnv())
        algo = OptimalActionInference(env=env,
                                      max_episode_length=max_episode_length)
        runner.setup(algo, env)
        meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                  max_episode_length=max_episode_length,
                                  n_test_tasks=10)
        log_file = tempfile.NamedTemporaryFile()
        csv_output = CsvOutput(log_file.name)
        logger.add_output(csv_output)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        logger.dump_output_type(CsvOutput)
        logger.remove_output_type(CsvOutput)
        with open(log_file.name, 'r') as file:
            rows = list(csv.DictReader(file))
        assert len(rows) == 2
        assert float(
            rows[0]['MetaTest/__unnamed_task__/TerminationRate']) < 1.0
        assert float(rows[0]['MetaTest/__unnamed_task__/Iteration']) == 0
        assert (float(rows[0]['MetaTest/__unnamed_task__/MaxReturn']) >=
                float(rows[0]['MetaTest/__unnamed_task__/AverageReturn']))
        assert (float(rows[0]['MetaTest/__unnamed_task__/AverageReturn']) >=
                float(rows[0]['MetaTest/__unnamed_task__/MinReturn']))
        assert float(rows[1]['MetaTest/__unnamed_task__/Iteration']) == 1
def ddpg_pendulum(ctxt=None, seed=1, lr=1e-4):
    """Train DDPG with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        lr (float): Learning rate for policy optimization.

    """
    set_seed(seed)
    runner = LocalRunner(ctxt)
    env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                   policy,
                                                   sigma=0.2)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    policy_optimizer = (torch.optim.Adagrad, {'lr': lr, 'lr_decay': 0.99})

    ddpg = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_policy=exploration_policy,
                target_update_tau=1e-2,
                discount=0.9,
                policy_optimizer=policy_optimizer,
                qf_optimizer=torch.optim.Adam)

    runner.setup(algo=ddpg, env=env)
    runner.train(n_epochs=500, batch_size=100)
def ddpg_pendulum(ctxt=None, seed=1):
    """Train DDPG with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = GarageEnv(gym.make('InvertedDoublePendulum-v2'))

        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)

        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)

        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    policy_lr=1e-4,
                    qf_lr=1e-3,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    max_path_length=100,
                    steps_per_epoch=20,
                    target_update_tau=1e-2,
                    n_train_steps=50,
                    discount=0.9,
                    min_buffer_size=int(1e4),
                    exploration_policy=exploration_policy,
                    policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                    qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        runner.setup(algo=ddpg, env=env)
        runner.train(n_epochs=500, batch_size=100)
def test_get_action(self, obs_dim, action_dim, hidden_dim):
    env = GarageEnv(
        DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = CategoricalLSTMPolicy(env_spec=env.spec,
                                   hidden_dim=hidden_dim,
                                   state_include_action=False)
    policy.reset()
    obs = env.reset()

    action, _ = policy.get_action(obs.flatten())
    assert env.action_space.contains(action)

    actions, _ = policy.get_actions([obs.flatten()])
    for action in actions:
        assert env.action_space.contains(action)