def trpo_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow TRPO model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=hyper_parameters['discount'],
                    gae_lambda=hyper_parameters['gae_lambda'],
                    max_kl_step=hyper_parameters['max_kl'])

        trainer.setup(algo, env)
        trainer.train(n_epochs=hyper_parameters['n_epochs'],
                      batch_size=hyper_parameters['batch_size'])
def setup_method(self):
    super().setup_method()
    self.env = normalize(
        GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    self.lstm_policy = GaussianLSTMPolicy(env_spec=self.env.spec)
    self.gru_policy = GaussianGRUPolicy(env_spec=self.env.spec)
    self.baseline = GaussianMLPBaseline(
        env_spec=self.env.spec,
        hidden_sizes=(32, 32),
    )
    self.sampler = LocalSampler(
        agents=self.policy,
        envs=self.env,
        max_episode_length=self.env.spec.max_episode_length,
        is_tf_worker=True)
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('Swimmer-v2'))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo,
                     env,
                     sampler_cls=RaySamplerTF,
                     sampler_args={'seed': seed})
        runner.train(n_epochs=40, batch_size=4000)
def test_erwr_cartpole(self):
    """Test ERWR with Cartpole environment."""
    logger.reset()
    env = TfEnv(normalize(CartpoleEnv()))

    policy = GaussianMLPPolicy(
        name="policy", env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ERWR(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=10000,
        max_path_length=100,
        n_itr=10,
        discount=0.99)

    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 100
def run_task(snapshot_config, *_):
    """Run the job."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(normalize(gym.make('InvertedPendulum-v2')))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo,
                     env,
                     sampler_cls=ISSampler,
                     sampler_args=dict(n_backtrack=1))
        runner.train(n_epochs=200, batch_size=4000)
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='Pusher3DOF-v1')

        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32),
                                   init_std=10)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=200, batch_size=50 * 250)
def trpo_swimmer_ray_sampler(ctxt=None, seed=1):
    """tf_trpo_swimmer.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    # Since this is an example, we are running ray in a reduced state.
    # One can comment this line out in order to run ray at full capacity.
    ray.init(_memory=52428800,
             object_store_memory=78643200,
             ignore_reinit_error=True,
             log_to_driver=False,
             include_dashboard=False)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        set_seed(seed)
        env = GymEnv('Swimmer-v2')

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    max_kl_step=0.01)

        trainer.setup(algo, env)
        trainer.train(n_epochs=40, batch_size=4000)
def run_task(v):
    v = SimpleNamespace(**v)

    with LocalRunner() as runner:
        # Environment
        env = FlatTorqueReacher(
            fix_goal=True,
            fixed_goal=GOALS[0],
            reward_type="hand_distance",
            # hand_distance_completion_bonus=0.,
            # torque_limit_pct=0.2,
            indicator_threshold=0.03,
            # velocity_penalty_coeff=0.01,
            action_scale=10.0,
            # hide_goal_pos=True,
        )
        env = TfEnv(normalize(env))

        # Policy
        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=(64, 32),
            init_std=v.policy_init_std,
        )

        baseline = GaussianMLPBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=v.max_path_length,
            discount=0.99,
            max_kl_step=0.01,
            # optimizer_args=dict(max_grad_norm=0.5)
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=v.batch_size, plot=True)
def test_tnpg_cartpole(self):
    """Test TNPG with Cartpole environment."""
    logger.reset()
    env = TfEnv(normalize(CartpoleEnv()))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TNPG(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=10000,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                optimizer_args=dict(reg_coeff=5e-2))

    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 40
def test_npo_unknown_pg_loss(self):
    """Test NPO with unknown policy gradient loss."""
    logger.reset()
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )

    with self.assertRaises(NotImplementedError) as context:
        NPO(
            env=env,
            policy=policy,
            baseline=baseline,
            pg_loss="random pg_loss",
        )
    assert "Unknown PGLoss" in str(context.exception)
def run_task(*_):
    env = TfEnv(normalize(PointEnv(goal=(-1, 0))))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=100,
        discount=0.99,
        step_size=0.01,
        plot=False,
        force_batch_sampler=True,
    )
    algo.train()
def trpo_swimmer_ray_sampler(ctxt=None, seed=1):
    """tf_trpo_swimmer.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    # Since this is an example, we are running ray in a reduced state.
    # One can comment this line out in order to run ray at full capacity.
    ray.init(memory=52428800,
             object_store_memory=78643200,
             ignore_reinit_error=True,
             log_to_driver=False,
             include_webui=False)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        set_seed(seed)
        env = TfEnv(gym.make('Swimmer-v2'))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo,
                     env,
                     sampler_cls=RaySampler,
                     sampler_args={'seed': seed})
        runner.train(n_epochs=40, batch_size=4000)
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(32, 32),
                use_trust_region=True,
            ),
        )

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            plot=False,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=120, batch_size=2048, plot=False)
def run_task(snapshot_config, *_):
    """Train CEM."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='Swimmer-v2')

        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        n_samples = 20

        algo = CEM(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   best_frac=0.05,
                   max_path_length=100,
                   n_samples=n_samples)

        runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)

        # NOTE: make sure that n_epoch_cycles == n_samples !
        runner.train(n_epochs=100, batch_size=1000, n_epoch_cycles=n_samples)
def test_tnpg_inverted_pendulum(self):
    """Test TNPG with InvertedPendulum-v2 environment."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = normalize(GymEnv('InvertedPendulum-v2'))

        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TNPG(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=0.99,
                    optimizer_args=dict(reg_coeff=5e-1))

        trainer.setup(algo, env, sampler_cls=LocalSampler)

        last_avg_ret = trainer.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 15

        env.close()
def test_rl2_sampler_invalid_num_of_env_again(self):
    with pytest.raises(
            ValueError,
            match='n_envs must be a multiple of meta_batch_size'):
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            policy = GaussianMLPPolicy(env_spec=self.env.spec,
                                       hidden_sizes=[32, 32])

            baseline = LinearFeatureBaseline(env_spec=self.env.spec)

            algo = PPO(env_spec=self.env.spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=self.max_path_length,
                       discount=0.99)

            runner.setup(algo,
                         env=self.env,
                         sampler_cls=RL2Sampler,
                         sampler_args=dict(
                             meta_batch_size=self.meta_batch_size,
                             n_envs=self.meta_batch_size + 1))
            runner._start_worker()
            runner._sampler.obtain_samples(0)
def test_tnpg_inverted_pendulum(self):
    """Test TNPG with InvertedPendulum-v2 environment."""
    env = TfEnv(normalize(gym.make("InvertedPendulum-v2")))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TNPG(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=10000,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                optimizer_args=dict(reg_coeff=5e-1))

    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 30

    env.close()
def test_ppo_pendulum_continuous_baseline(self):
    """Test PPO with Pendulum environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        runner.setup(algo, env, sampler_cls=LocalSampler)

        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 100

        env.close()
def test_tnpg_inverted_pendulum(self):
    """Test TNPG with InvertedPendulum-v2 environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = GarageEnv(normalize(gym.make('InvertedPendulum-v2')))

        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TNPG(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    optimizer_args=dict(reg_coeff=5e-1))

        runner.setup(algo, env)

        last_avg_ret = runner.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 15

        env.close()
def multi_env_trpo(ctxt=None, seed=1):
    """Train TRPO on two different PointEnv instances.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env1 = normalize(PointEnv(goal=(-1., 0.), max_episode_length=100))
        env2 = normalize(PointEnv(goal=(1., 0.), max_episode_length=100))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        trainer.setup(algo, env)
        trainer.train(n_epochs=40, batch_size=2048, plot=False)
def run_task(vv):
    env = TfEnv(normalize(gym.make('HalfCheetah-v1')))

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=(32, 32),
                               name="policy")

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=vv["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable
        # plotting
        # plot=True,
    )
    algo.train()
def test_dm_control_tf_policy(self):
    task = ALL_TASKS[0]

    with self.graph.as_default():
        env = TfEnv(DmControlEnv(domain_name=task[0], task_name=task[1]))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=10,
            max_path_length=5,
            n_itr=1,
            discount=0.99,
            step_size=0.01,
        )

        algo.train()
def run_task(*_):
    """
    Wrap PPO training task in the run_task function.

    :param _:
    :return:
    """
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = PPO(env=env,
               policy=policy,
               baseline=baseline,
               batch_size=2048,
               max_path_length=100,
               n_itr=488,
               discount=0.99,
               step_size=0.01,
               optimizer_args=dict(batch_size=32, max_epochs=10),
               plot=False)

    algo.train()
def trpo_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow TRPO model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(normalize(gym.make(env_id)))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=hyper_parameters['max_path_length'],
                    discount=hyper_parameters['discount'],
                    gae_lambda=hyper_parameters['gae_lambda'],
                    max_kl_step=hyper_parameters['max_kl'])

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])
def test_get_action(self, obs_dim, action_dim):
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    obs_var = tf.compat.v1.placeholder(
        tf.float32,
        shape=[None, None, env.observation_space.flat_dim],
        name='obs')
    policy = GaussianMLPPolicy(env_spec=env.spec)

    policy.build(obs_var)

    env.reset()
    obs, _, _, _ = env.step(1)

    action, _ = policy.get_action(obs.flatten())
    assert env.action_space.contains(action)

    actions, _ = policy.get_actions(
        [obs.flatten(), obs.flatten(), obs.flatten()])
    for action in actions:
        assert env.action_space.contains(action)
def run_garage(env, seed, log_dir):
    '''
    Create garage model and training.

    Replace the ppo with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return:
    '''
    deterministic.set_seed(seed)

    with LocalRunner() as runner:
        env = TfEnv(normalize(env))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    tf_optimizer_args=dict(learning_rate=1e-3),
                ),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
            plot=False,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        garage_logger.add_output(StdOutput())
        garage_logger.add_output(CsvOutput(tabular_log_file))
        garage_logger.add_output(TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=488, batch_size=2048)

        garage_logger.remove_all()

        return tabular_log_file
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.misc.instrument import stub
from garage.misc.instrument import run_experiment
from garage.tf.algos import TRPO
from garage.tf.policies import GaussianMLPPolicy
from garage.tf.envs import TfEnv

from sandbox.embed2learn.envs.mujoco import PR2ArmEnv

env = TfEnv(normalize(PR2ArmEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_sizes=(32, 32),
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=100,
    discount=0.99,
    step_size=0.01,
    plot=True,
class TestGaussianMLPPolicyWithModelTransit(TfGraphTestCase):

    def setup_method(self):
        with mock.patch('tensorflow.random.normal') as mock_rand:
            mock_rand.return_value = 0.5
            super().setup_method()
            self.box_env = TfEnv(DummyBoxEnv())
            self.policy1 = GaussianMLPPolicy(env_spec=self.box_env,
                                             init_std=1.0,
                                             name='P1')
            self.policy2 = GaussianMLPPolicy(env_spec=self.box_env,
                                             init_std=1.2,
                                             name='P2')
            self.policy3 = GaussianMLPPolicyWithModel(env_spec=self.box_env,
                                                      init_std=1.0,
                                                      name='P3')
            self.policy4 = GaussianMLPPolicyWithModel(env_spec=self.box_env,
                                                      init_std=1.2,
                                                      name='P4')

        self.sess.run(tf.global_variables_initializer())

        for a, b in zip(self.policy3.get_params(), self.policy1.get_params()):
            self.sess.run(tf.assign(b, a))
        for a, b in zip(self.policy4.get_params(), self.policy2.get_params()):
            self.sess.run(tf.assign(b, a))

        self.obs = [self.box_env.reset()]
        self.obs_ph = tf.placeholder(
            tf.float32, shape=(None, self.box_env.observation_space.flat_dim))
        self.action_ph = tf.placeholder(
            tf.float32, shape=(None, self.box_env.action_space.flat_dim))

        self.dist1_sym = self.policy1.dist_info_sym(self.obs_ph, name='p1_sym')
        self.dist2_sym = self.policy2.dist_info_sym(self.obs_ph, name='p2_sym')
        self.dist3_sym = self.policy3.dist_info_sym(self.obs_ph, name='p3_sym')
        self.dist4_sym = self.policy4.dist_info_sym(self.obs_ph, name='p4_sym')

        assert self.policy1.vectorized == self.policy2.vectorized
        assert self.policy3.vectorized == self.policy4.vectorized

    def test_dist_info_sym_output(self):
        dist1 = self.sess.run(self.dist1_sym,
                              feed_dict={self.obs_ph: self.obs})
        dist2 = self.sess.run(self.dist2_sym,
                              feed_dict={self.obs_ph: self.obs})
        dist3 = self.sess.run(self.dist3_sym,
                              feed_dict={self.obs_ph: self.obs})
        dist4 = self.sess.run(self.dist4_sym,
                              feed_dict={self.obs_ph: self.obs})

        assert np.array_equal(dist1['mean'], dist3['mean'])
        assert np.array_equal(dist1['log_std'], dist3['log_std'])
        assert np.array_equal(dist2['mean'], dist4['mean'])
        assert np.array_equal(dist2['log_std'], dist4['log_std'])

    @mock.patch('numpy.random.normal')
    def test_get_action(self, mock_rand):
        mock_rand.return_value = 0.5
        action1, _ = self.policy1.get_action(self.obs)
        action2, _ = self.policy2.get_action(self.obs)
        action3, _ = self.policy3.get_action(self.obs)
        action4, _ = self.policy4.get_action(self.obs)

        assert np.array_equal(action1, action3)
        assert np.array_equal(action2, action4)

        actions1, dist_info1 = self.policy1.get_actions([self.obs])
        actions2, dist_info2 = self.policy2.get_actions([self.obs])
        actions3, dist_info3 = self.policy3.get_actions([self.obs])
        actions4, dist_info4 = self.policy4.get_actions([self.obs])

        assert np.array_equal(actions1, actions3)
        assert np.array_equal(actions2, actions4)

        assert np.array_equal(dist_info1['mean'], dist_info3['mean'])
        assert np.array_equal(dist_info1['log_std'], dist_info3['log_std'])
        assert np.array_equal(dist_info2['mean'], dist_info4['mean'])
        assert np.array_equal(dist_info2['log_std'], dist_info4['log_std'])

    def test_kl_sym(self):
        kl_diff_sym1 = self.policy1.distribution.kl_sym(
            self.dist1_sym, self.dist2_sym)
        objective1 = tf.reduce_mean(kl_diff_sym1)

        kl_func = tensor_utils.compile_function([self.obs_ph], objective1)
        kl1 = kl_func(self.obs, self.obs)

        kl_diff_sym2 = self.policy3.distribution.kl_sym(
            self.dist3_sym, self.dist4_sym)
        objective2 = tf.reduce_mean(kl_diff_sym2)

        kl_func = tensor_utils.compile_function([self.obs_ph], objective2)
        kl2 = kl_func(self.obs, self.obs)

        assert np.array_equal(kl1, kl2)
        assert kl1 == pytest.approx(kl2)

    def test_log_likelihood_sym(self):
        log_prob_sym1 = self.policy1.distribution.log_likelihood_sym(
            self.action_ph, self.dist1_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func(self.obs, [[1, 1]])

        log_prob_sym2 = self.policy3.model.networks[
            'default'].dist.log_likelihood_sym(self.action_ph, self.dist3_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2(self.obs, [[1, 1]])
        assert log_prob1 == log_prob2

        log_prob_sym1 = self.policy2.distribution.log_likelihood_sym(
            self.action_ph, self.dist2_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func(self.obs, [[1, 1]])

        log_prob_sym2 = self.policy4.model.networks[
            'default'].dist.log_likelihood_sym(self.action_ph, self.dist4_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2(self.obs, [[1, 1]])
        assert log_prob1 == log_prob2

    def test_policy_entropy_sym(self):
        entropy_sym1 = self.policy1.distribution.entropy_sym(
            self.dist1_sym, name='entropy_sym1')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym1)
        entropy1 = entropy_func(self.obs)

        entropy_sym2 = self.policy3.distribution.entropy_sym(
            self.dist3_sym, name='entropy_sym1')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym2)
        entropy2 = entropy_func(self.obs)
        assert entropy1 == entropy2

    def test_likelihood_ratio_sym(self):
        likelihood_ratio_sym1 = self.policy1.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist1_sym,
            self.dist2_sym,
            name='li_ratio_sym1')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym1)
        likelihood_ratio1 = likelihood_ratio_func([[1, 1]], self.obs)

        likelihood_ratio_sym2 = self.policy3.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist3_sym,
            self.dist4_sym,
            name='li_ratio_sym2')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym2)
        likelihood_ratio2 = likelihood_ratio_func([[1, 1]], self.obs)

        assert likelihood_ratio1 == likelihood_ratio2
""" Example using TRPO with ISSampler, iterations alternate between live and importance sampled iterations. """ import gym from garage.baselines import LinearFeatureBaseline from garage.contrib.alexbeloi.is_sampler import ISSampler from garage.envs import normalize from garage.tf.algos import TRPO from garage.tf.policies import GaussianMLPPolicy env = normalize(gym.make('InvertedPendulum-v2')) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) optimizer_args = dict( # debug_nan=True, # reg_coeff=0.1, # cg_iters=2 ) algo = TRPO(env=env, policy=policy, baseline=baseline, batch_size=4000, max_path_length=100, n_itr=200, discount=0.99,
def test_clone(self):
    env = GarageEnv(DummyBoxEnv(obs_dim=(10, ), action_dim=(4, )))
    policy = GaussianMLPPolicy(env_spec=env.spec)
    policy_clone = policy.clone('GaussianMLPPolicyClone')
    assert policy.env_spec == policy_clone.env_spec