def run_task(*_): """Run the job.""" with LocalRunner() as runner: env = TfEnv(normalize(gym.make('InvertedPendulum-v2'))) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = VPG( env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, max_kl_step=0.01, ) runner.setup(algo, env, sampler_cls=ISSampler, sampler_args=dict(n_backtrack=1)) runner.train(n_epochs=40, batch_size=4000)
def test_make_sampler_ray_sampler(self, ray_session_fixture):
    del ray_session_fixture
    assert ray.is_initialized()
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(env_name='CubeCrash-v0')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(
                       tf_optimizer_args=dict(learning_rate=0.01, )))
        runner.setup(algo, env, sampler_cls=RaySampler)
        assert isinstance(runner._sampler, RaySampler)
        runner.train(n_epochs=1, batch_size=10)
def run_task(snapshot_config, *_): """Train CEM with Cartpole-v1 environment.""" with LocalTFRunner(snapshot_config=snapshot_config) as runner: env = TfEnv(env_name='CartPole-v1') policy = CategoricalMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) n_samples = 20 algo = CEM(env_spec=env.spec, policy=policy, baseline=baseline, best_frac=0.05, max_path_length=100, n_samples=n_samples) runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler) # NOTE: make sure that n_epoch_cycles == n_samples ! runner.train(n_epochs=100, batch_size=1000, n_epoch_cycles=n_samples)
def run_task(snapshot_config, *_): """Run task.""" with LocalTFRunner(snapshot_config=snapshot_config) as runner: env = TfEnv(gym.make('InvertedDoublePendulum-v2')) action_noise = OUStrategy(env.spec, sigma=0.2) policy = ContinuousMLPPolicyWithModel(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu) replay_buffer = SimpleReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=100) ddpg = DDPG(env_spec=env.spec, policy=policy, policy_lr=1e-4, qf_lr=1e-3, qf=qf, replay_buffer=replay_buffer, target_update_tau=1e-2, n_train_steps=50, discount=0.9, min_buffer_size=int(1e4), exploration_strategy=action_noise, policy_optimizer=tf.train.AdamOptimizer, qf_optimizer=tf.train.AdamOptimizer) runner.setup(algo=ddpg, env=env) runner.train(n_epochs=500, n_epoch_cycles=20, batch_size=100)
def trpo_garage_pytorch(ctxt, env_id, seed):
    """Create garage PyTorch TRPO model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    runner = LocalRunner(ctxt)

    env = TfEnv(normalize(gym.make(env_id)))

    policy = PyTorch_GMP(env.spec,
                         hidden_sizes=hyper_parameters['hidden_sizes'],
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    algo = PyTorch_TRPO(env_spec=env.spec,
                        policy=policy,
                        value_function=value_function,
                        max_path_length=hyper_parameters['max_path_length'],
                        discount=hyper_parameters['discount'],
                        gae_lambda=hyper_parameters['gae_lambda'])

    runner.setup(algo, env)
    runner.train(n_epochs=hyper_parameters['n_epochs'],
                 batch_size=hyper_parameters['batch_size'])
def run_task(snapshot_config, *_): """Run task.""" with LocalTFRunner(snapshot_config=snapshot_config) as runner: n_epochs = 10 n_epoch_cycles = 10 sampler_batch_size = 500 num_timesteps = n_epochs * n_epoch_cycles * sampler_batch_size env = TfEnv(gym.make('CartPole-v0')) replay_buffer = SimpleReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e4), time_horizon=1) qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64)) policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf) epilson_greedy_strategy = EpsilonGreedyStrategy( env_spec=env.spec, total_timesteps=num_timesteps, max_epsilon=1.0, min_epsilon=0.02, decay_ratio=0.1) algo = DQN(env_spec=env.spec, policy=policy, qf=qf, exploration_strategy=epilson_greedy_strategy, replay_buffer=replay_buffer, qf_lr=1e-4, discount=1.0, min_buffer_size=int(1e3), double_q=True, n_train_steps=500, n_epoch_cycles=n_epoch_cycles, target_network_update_freq=1, buffer_batch_size=32) runner.setup(algo, env) runner.train(n_epochs=n_epochs, n_epoch_cycles=n_epoch_cycles, batch_size=sampler_batch_size)
def run_task(v):
    v = SimpleNamespace(**v)

    # Environment
    env = SimplePusherEnv(action_scale=0.04,
                          control_method="position_control",
                          completion_bonus=0.1,
                          collision_penalty=0.05)

    env = TfEnv(env)

    # Policy
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(256, 128),
        init_std=v.policy_init_std,
    )

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(256, 128)),
    )

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=2000,
        discount=0.99,
        step_size=0.2,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=True,
    )
    algo.train()
def run_task(snapshot_config, *_): """Run task.""" with LocalTFRunner(snapshot_config=snapshot_config) as runner: # env = TfEnv(normalize(MassSpringEnv_OptL_HwAsAction(params), normalize_action=False, normalize_obs=False, normalize_reward=True, reward_alpha=0.1)) env = TfEnv(MassSpringEnv_OptL_HwAsAction(params)) zip_project(log_dir=runner._snapshotter._snapshot_dir) comp_policy_model = MLPModel(output_dim=1, hidden_sizes=params.comp_policy_network_size, hidden_nonlinearity=None, output_nonlinearity=None, ) mech_policy_model = MechPolicyModel_OptL_HwAsAction(params) policy = CompMechPolicy_OptL_HwAsAction(name='comp_mech_policy', env_spec=env.spec, comp_policy_model=comp_policy_model, mech_policy_model=mech_policy_model) # baseline = GaussianMLPBaseline( # env_spec=env.spec, # regressor_args=dict( # hidden_sizes=params.baseline_network_size, # hidden_nonlinearity=tf.nn.tanh, # use_trust_region=True, # ), # ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = CMAES(env_spec=env.spec, policy=policy, baseline=baseline, **params.cmaes_algo_kwargs) runner.setup(algo, env) runner.train(**params.cmaes_train_kwargs)
def run_task(vv):
    env = TfEnv(normalize(gym.make('HalfCheetah-v1')))

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=(32, 32),
                               name="policy")

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=vv["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable
        # plotting
        # plot=True,
    )
    algo.train()
def test_is_pickleable(self):
    box_env = TfEnv(DummyBoxEnv(obs_dim=(1, )))
    with mock.patch(('garage.tf.baselines.'
                     'gaussian_mlp_baseline_with_model.'
                     'GaussianMLPRegressorWithModel'),
                    new=SimpleGaussianMLPRegressor):
        gmb = GaussianMLPBaselineWithModel(env_spec=box_env.spec)
    obs = {'observations': [np.full(1, 1), np.full(1, 1)]}

    with tf.compat.v1.variable_scope('GaussianMLPBaselineWithModel',
                                     reuse=True):
        return_var = tf.compat.v1.get_variable(
            'SimpleGaussianMLPModel/return_var')
    return_var.load(1.0)

    prediction = gmb.predict(obs)

    h = pickle.dumps(gmb)
    with tf.compat.v1.Session(graph=tf.Graph()):
        gmb_pickled = pickle.loads(h)
        prediction2 = gmb_pickled.predict(obs)

        assert np.array_equal(prediction, prediction2)
def test_obtain_exact_trajectories():
    max_path_length = 15
    n_workers = 8
    env = TfEnv(PointEnv())
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_path_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = RaySampler.from_worker_factory(workers, policies, envs=env)
    n_traj_per_worker = 3
    rollouts = sampler.obtain_exact_trajectories(n_traj_per_worker, policies)
    # At least one action per trajectory.
    assert sum(rollouts.lengths) >= n_workers * n_traj_per_worker
    # All of the trajectories.
    assert len(rollouts.lengths) == n_workers * n_traj_per_worker
    worker = -1
    for count, rollout in enumerate(rollouts.split()):
        if count % n_traj_per_worker == 0:
            worker += 1
        assert (rollout.actions == per_worker_actions[worker]).all()
def run_task(snapshot_config, *_):
    with LocalRunner(snapshot_config=snapshot_config,
                     max_cpus=n_envs) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=max_path_length,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo=algo,
                     env=env,
                     sampler_cls=BatchSampler,
                     sampler_args={'n_envs': n_envs})

        runner.train(n_epochs=100, batch_size=4000, plot=False)
def run_task(snapshot_config, *_): """Run the job.""" env = TfEnv(env_name='InvertedDoublePendulum-v2') runner = LocalRunner(snapshot_config) policy = GaussianMLPPolicy(env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=torch.tanh, output_nonlinearity=None) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = VPG(env_spec=env.spec, policy=policy, optimizer=torch.optim.Adam, baseline=baseline, max_path_length=100, discount=0.99, center_adv=False, policy_lr=1e-2) runner.setup(algo, env) runner.train(n_epochs=100, batch_size=10000)
def test_tf_batch_sampler(self):
    max_cpus = 8
    with LocalTFRunner(snapshot_config, max_cpus=max_cpus) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=1,
                   discount=0.99)

        runner.setup(algo,
                     env,
                     sampler_cls=BatchSampler,
                     sampler_args={'n_envs': max_cpus})

        try:
            runner.initialize_tf_vars()
        except BaseException:
            raise AssertionError(
                'LocalRunner should be able to initialize tf variables.')

        runner._start_worker()

        paths = runner._sampler.obtain_samples(0,
                                               batch_size=8,
                                               whole_paths=True)
        assert len(paths) >= max_cpus, (
            'BatchSampler should sample more than max_cpus={} '
            'trajectories'.format(max_cpus))
def test_make_sampler_batch_sampler(self):
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(
                       tf_optimizer_args=dict(learning_rate=0.01, )))
        runner.setup(algo,
                     env,
                     sampler_cls=BatchSampler,
                     sampler_args=dict(n_envs=3))
        assert isinstance(runner._sampler, BatchSampler)
        runner.train(n_epochs=1, batch_size=10)
def vpg_garage_pytorch(ctxt, env_id, seed):
    """Create garage PyTorch VPG model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    runner = LocalRunner(ctxt)

    env = TfEnv(normalize(gym.make(env_id)))

    policy = PyTorch_GMP(env.spec,
                         hidden_sizes=hyper_parameters['hidden_sizes'],
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    value_function = LinearFeatureBaseline(env_spec=env.spec)

    algo = PyTorch_VPG(env_spec=env.spec,
                       policy=policy,
                       optimizer=torch.optim.Adam,
                       policy_lr=hyper_parameters['learning_rate'],
                       value_function=value_function,
                       max_path_length=hyper_parameters['max_path_length'],
                       discount=hyper_parameters['discount'],
                       center_adv=hyper_parameters['center_adv'])

    runner.setup(algo, env)
    runner.train(n_epochs=hyper_parameters['n_epochs'],
                 batch_size=hyper_parameters['batch_size'])
def trpo_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow TRPO model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(normalize(gym.make(env_id)))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=hyper_parameters['max_path_length'],
                    discount=hyper_parameters['discount'],
                    gae_lambda=hyper_parameters['gae_lambda'],
                    max_kl_step=hyper_parameters['max_kl'])

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])
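The benchmark launchers above (trpo_garage_pytorch, vpg_garage_pytorch, trpo_garage_tf) all read from a module-level hyper_parameters dict that is not shown in this excerpt. A sketch of its shape, listing exactly the keys those functions index; the values are placeholder assumptions, not the benchmark's actual settings:

hyper_parameters = {
    'hidden_sizes': (32, 32),   # policy network widths (placeholder)
    'learning_rate': 1e-2,      # used by vpg_garage_pytorch
    'max_path_length': 100,
    'discount': 0.99,
    'gae_lambda': 0.97,
    'max_kl': 0.01,             # used by trpo_garage_tf
    'center_adv': True,         # used by vpg_garage_pytorch
    'n_epochs': 100,
    'batch_size': 2048,
}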
def test_obs_is_image(self):
    env = TfEnv(DummyDiscretePixelEnv(), is_image=True)
    with mock.patch(('garage.tf.baselines.'
                     'gaussian_cnn_baseline.'
                     'GaussianCNNRegressor'),
                    new=SimpleGaussianCNNRegressor):
        with mock.patch(
                'garage.tf.baselines.'
                'gaussian_cnn_baseline.'
                'normalize_pixel_batch',
                side_effect=normalize_pixel_batch) as npb:
            gcb = GaussianCNNBaseline(env_spec=env.spec)

            obs_dim = env.spec.observation_space.shape
            paths = [{
                'observations': [np.full(obs_dim, 1)],
                'returns': [1]
            }, {
                'observations': [np.full(obs_dim, 2)],
                'returns': [2]
            }]
            gcb.fit(paths)

            observations = np.concatenate(
                [p['observations'] for p in paths])
            npb.assert_called_once()
            assert (npb.call_args_list[0][0][0] == observations).all()

            obs = {
                'observations': [np.full(obs_dim, 1),
                                 np.full(obs_dim, 2)]
            }
            observations = obs['observations']
            gcb.predict(obs)
            assert npb.call_args_list[1][0][0] == observations
def run_task(*_): """ Wrap PPO training task in the run_task function. :param _: :return: """ env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2"))) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64)) baseline = GaussianMLPBaseline(env_spec=env.spec) algo = PPO(env=env, policy=policy, baseline=baseline, batch_size=2048, max_path_length=100, n_itr=488, discount=0.99, step_size=0.01, optimizer_args=dict(batch_size=32, max_epochs=10), plot=False) algo.train()
def test_dm_control_tf_policy(self):
    task = ALL_TASKS[0]

    with self.graph.as_default():
        env = TfEnv(DmControlEnv(domain_name=task[0], task_name=task[1]))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=10,
            max_path_length=5,
            n_itr=1,
            discount=0.99,
            step_size=0.01,
        )
        algo.train()
def test_is_pickleable(self, batch_size, hidden_sizes):
    """Test if policy is unchanged after pickling."""
    env_spec = TfEnv(DummyBoxEnv())
    obs_dim = env_spec.observation_space.flat_dim
    obs = torch.ones([batch_size, obs_dim], dtype=torch.float32)
    init_std = 2.

    policy = TanhGaussianMLPPolicy(env_spec=env_spec,
                                   hidden_sizes=hidden_sizes,
                                   init_std=init_std,
                                   hidden_nonlinearity=None,
                                   std_parameterization='exp',
                                   hidden_w_init=nn.init.ones_,
                                   output_w_init=nn.init.ones_)

    output1_action, output1_prob = policy.get_actions(obs)

    p = pickle.dumps(policy)
    policy_pickled = pickle.loads(p)
    output2_action, output2_prob = policy_pickled.get_actions(obs)

    assert np.allclose(output2_prob['mean'],
                       output1_prob['mean'],
                       rtol=1e-3)
    assert output1_action.shape == output2_action.shape
def run_task(*_): """Train CMA_ES with Cartpole-v1 environment.""" with LocalRunner() as runner: env = TfEnv(env_name='CartPole-v1') policy = CategoricalMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) runner.initialize_tf_vars() n_samples = 20 algo = CMAES(env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, n_samples=n_samples) runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler) # NOTE: make sure that n_epoch_cycles == n_samples ! runner.train(n_epochs=100, batch_size=1000, n_epoch_cycles=n_samples)
def run_garage(env, seed, log_dir):
    '''
    Create garage model and training.

    Replace DDPG with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return:
    '''
    deterministic.set_seed(seed)

    with LocalRunner() as runner:
        env = TfEnv(env)
        # Set up params for DDPG
        action_noise = OUStrategy(env.spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=params['qf_hidden_sizes'],
                                    hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'])

        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    policy_lr=params['policy_lr'],
                    qf_lr=params['qf_lr'],
                    target_update_tau=params['tau'],
                    n_train_steps=params['n_train_steps'],
                    discount=params['discount'],
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.train.AdamOptimizer,
                    qf_optimizer=tf.train.AdamOptimizer)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        tensorboard_log_dir = osp.join(log_dir)
        garage_logger.add_output(StdOutput())
        garage_logger.add_output(CsvOutput(tabular_log_file))
        garage_logger.add_output(TensorBoardOutput(tensorboard_log_dir))

        runner.setup(ddpg, env)
        runner.train(n_epochs=params['n_epochs'],
                     n_epoch_cycles=params['n_epoch_cycles'],
                     batch_size=params['n_rollout_steps'])

        garage_logger.remove_all()

        return tabular_log_file
def run_garage(env, seed, log_dir):
    '''
    Create garage model and training.

    Replace DDPG (with HER) with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return:
    '''
    deterministic.set_seed(seed)
    env.reset()

    with LocalRunner() as runner:
        env = TfEnv(normalize(env))

        action_noise = OUStrategy(env.spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=params['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            policy_lr=params['policy_lr'],
            qf_lr=params['qf_lr'],
            target_update_tau=params['tau'],
            n_train_steps=params['n_train_steps'],
            discount=params['discount'],
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        logger.add_output(dowel.StdOutput())
        logger.add_output(dowel.CsvOutput(tabular_log_file))
        logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=params['n_epochs'],
                     n_epoch_cycles=params['n_epoch_cycles'],
                     batch_size=params['n_rollout_steps'])

        logger.remove_all()

        return tabular_log_file
class TestTRPO(TfGraphTestCase):

    def setup_method(self):
        super().setup_method()
        self.env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        self.policy = GaussianMLPPolicy(
            env_spec=self.env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        self.baseline = GaussianMLPBaseline(
            env_spec=self.env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )

    @pytest.mark.large
    @pytest.mark.mujoco
    def test_trpo_pendulum(self):
        """Test TRPO with Pendulum environment."""
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            algo = TRPO(env_spec=self.env.spec,
                        policy=self.policy,
                        baseline=self.baseline,
                        max_path_length=100,
                        discount=0.99,
                        gae_lambda=0.98,
                        policy_ent_coeff=0.0)
            runner.setup(algo, self.env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 50

    @pytest.mark.mujoco
    def test_trpo_unknown_kl_constraint(self):
        """Test TRPO with an unknown KL constraint."""
        with pytest.raises(ValueError, match='Invalid kl_constraint'):
            TRPO(
                env_spec=self.env.spec,
                policy=self.policy,
                baseline=self.baseline,
                max_path_length=100,
                discount=0.99,
                gae_lambda=0.98,
                policy_ent_coeff=0.0,
                kl_constraint='random kl_constraint',
            )

    @pytest.mark.large
    @pytest.mark.mujoco
    def test_trpo_soft_kl_constraint(self):
        """Test TRPO with a soft KL constraint."""
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            algo = TRPO(env_spec=self.env.spec,
                        policy=self.policy,
                        baseline=self.baseline,
                        max_path_length=100,
                        discount=0.99,
                        gae_lambda=0.98,
                        policy_ent_coeff=0.0,
                        kl_constraint='soft')
            runner.setup(algo, self.env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 45

    @pytest.mark.large
    @pytest.mark.mujoco
    def test_trpo_lstm_cartpole(self):
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = TfEnv(normalize(gym.make('CartPole-v1')))

            policy = CategoricalLSTMPolicy(name='policy', env_spec=env.spec)

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=100,
                        discount=0.99,
                        max_kl_step=0.01,
                        optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                            base_eps=1e-5)))

            snapshotter.snapshot_dir = './'
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 80

            env.close()

    @pytest.mark.large
    @pytest.mark.mujoco
    def test_trpo_gru_cartpole(self):
        deterministic.set_seed(2)
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = TfEnv(normalize(gym.make('CartPole-v1')))

            policy = CategoricalGRUPolicy(name='policy', env_spec=env.spec)

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=100,
                        discount=0.99,
                        max_kl_step=0.01,
                        optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                            base_eps=1e-5)))

            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 80

            env.close()

    def teardown_method(self):
        self.env.close()
        super().teardown_method()
def setup_method(self):
    self.env = TfEnv(normalize(gym.make('CartPole-v1')))
    self.baseline = LinearFeatureBaseline(env_spec=self.env.spec)
def test_no_reset(self):
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        # This tests whether the off-policy sampler respects batch_size
        # when no_reset is set to True.
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(1e4),
            exploration_policy=exploration_policy,
        )

        sampler = OffPolicyVectorizedSampler(algo, env, 1, no_reset=True)
        sampler.start_worker()

        runner.initialize_tf_vars()

        paths1 = sampler.obtain_samples(0, 5)
        paths2 = sampler.obtain_samples(0, 5)

        len1 = sum([len(path['rewards']) for path in paths1])
        len2 = sum([len(path['rewards']) for path in paths2])

        assert len1 == 5 and len2 == 5, 'Sampler should respect batch_size'

        # yapf: disable
        # When done is False in the 1st sampling, the next sampling should
        # be stacked with the last batch of the 1st sampling.
        case1 = (len(paths1[-1]['rewards']) + len(paths2[0]['rewards']) ==
                 paths2[0]['running_length'])
        # When done is True in the 1st sampling, the next sampling should
        # be separated.
        case2 = len(paths2[0]['rewards']) == paths2[0]['running_length']
        done = paths1[-1]['dones'][-1]
        assert (
            (not done and case1) or (done and case2)
        ), 'Running length should be the length of full path'
        # yapf: enable

        case1 = np.isclose(
            paths1[-1]['rewards'].sum() + paths2[0]['rewards'].sum(),
            paths2[0]['undiscounted_return'])
        case2 = np.isclose(paths2[0]['rewards'].sum(),
                           paths2[0]['undiscounted_return'])
        assert (
            (not done and case1) or (done and case2)
        ), 'Undiscounted_return should be the sum of rewards of full path'
def run_garage(env, seed, log_dir):
    '''
    Create garage model and training.

    Replace the ppo with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return:
    '''
    deterministic.set_seed(seed)

    with LocalRunner() as runner:
        env = TfEnv(normalize(env))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    tf_optimizer_args=dict(learning_rate=1e-3),
                ),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
            plot=False,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        garage_logger.add_output(StdOutput())
        garage_logger.add_output(CsvOutput(tabular_log_file))
        garage_logger.add_output(TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=488, batch_size=2048)

        garage_logger.remove_all()

        return tabular_log_file
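The run_garage(env, seed, log_dir) benchmark helpers above return the path of the progress CSV they write. A hypothetical driver loop showing how they might be invoked; the environment id, trial count, and directory layout are assumptions for illustration, not the original benchmark harness:

import random

import gym

env = gym.make('HalfCheetah-v2')  # placeholder task
for trial in range(3):
    seed = random.randint(1, 10000)
    log_dir = './data/local/benchmark/trial_{}_seed_{}'.format(trial, seed)
    garage_csv = run_garage(env, seed, log_dir)
    print('trial {} logged progress to {}'.format(trial, garage_csv))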
#!/usr/bin/env python3
import gym

from garage.baselines import LinearFeatureBaseline
from garage.experiment import run_experiment
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import CategoricalMLPPolicy

# Need to wrap in a tf environment and force_reset to true
# see https://github.com/openai/rllab/issues/87#issuecomment-282519288
env = TfEnv(gym.make("CartPole-v0"))

policy = CategoricalMLPPolicy(name="policy",
                              env_spec=env.spec,
                              hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=200,
    n_itr=120,
    discount=0.99,
    max_kl_step=0.01,
)
import numpy as np

from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.misc.instrument import stub
from garage.misc.instrument import run_experiment
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import GaussianMLPPolicy

from sandbox.embed2learn.envs.mujoco import PR2ArmEnv

env = TfEnv(normalize(PR2ArmEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_sizes=(32, 32),
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=100,
    discount=0.99,
)