def test_continuous_mlp_policy(self):
    """Smoke-test that ContinuousMLPPolicy can be built and queried.

    Uses the upper bound of the observation space as a representative
    observation and checks that ``get_action`` produces an action.
    """
    continuous_mlp_policy = ContinuousMLPPolicy(env_spec=self.env,
                                                hidden_sizes=(1, ))
    self.sess.run(tf.global_variables_initializer())
    obs = self.env.observation_space.high
    # get_action returns (action, agent_info); asserting on the raw tuple
    # is always truthy and can never fail, so assert on the action itself.
    action, _ = continuous_mlp_policy.get_action(obs)
    assert action is not None
def setup_method(self):
    """Build two pairs of policies with tied weights for comparison tests.

    ``tensorflow.random.normal`` is patched so any random draw through it
    is the constant 0.5 while the fixtures are constructed, making the
    setup deterministic.
    """
    with mock.patch('tensorflow.random.normal') as mock_rand:
        mock_rand.return_value = 0.5
        super().setup_method()
        self.box_env = TfEnv(DummyBoxEnv())
        self.policy1 = ContinuousMLPPolicy(env_spec=self.box_env,
                                           hidden_sizes=(32, 32),
                                           name='P1')
        self.policy2 = ContinuousMLPPolicy(env_spec=self.box_env,
                                           hidden_sizes=(64, 64),
                                           name='P2')
        self.policy3 = ContinuousMLPPolicyWithModel(env_spec=self.box_env,
                                                    hidden_sizes=(32, 32),
                                                    name='P3')
        self.policy4 = ContinuousMLPPolicyWithModel(env_spec=self.box_env,
                                                    hidden_sizes=(64, 64),
                                                    name='P4')
        self.sess.run(tf.compat.v1.global_variables_initializer())
        # Copy parameters so the Model-based policies (P3/P4) start with
        # weights identical to P1/P2, making their outputs comparable.
        for a, b in zip(self.policy3.get_params(), self.policy1.get_params()):
            self.sess.run(a.assign(b))
        for a, b in zip(self.policy4.get_params(), self.policy2.get_params()):
            self.sess.run(a.assign(b))
        self.obs = self.box_env.reset()
        self.action_bound = self.box_env.action_space.high
        # Matching pairs must agree on whether they support batched obs.
        assert self.policy1.vectorized == self.policy2.vectorized
        assert self.policy3.vectorized == self.policy4.vectorized
def test_is_pickleable(self, obs_dim, action_dim):
    """Test if ContinuousMLPPolicy is pickleable.

    The policy is perturbed (bias set to ones), run, pickled, restored in
    a fresh graph/session, and re-run; both outputs must match exactly.
    """
    env = GymEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = ContinuousMLPPolicy(env_spec=env.spec)
    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, np.prod(obs_dim)))
    outputs = policy.build(state_input, name='policy')
    env.reset()
    obs = env.step(1).observation
    with tf.compat.v1.variable_scope('ContinuousMLPPolicy', reuse=True):
        bias = tf.compat.v1.get_variable('mlp/hidden_0/bias')
        # assign it to all one so the output differs from a fresh init
        bias.load(tf.ones_like(bias).eval())
    output1 = self.sess.run([outputs],
                            feed_dict={state_input: [obs.flatten()]})
    p = pickle.dumps(policy)
    # Restore in a brand-new graph to prove pickling carries the weights.
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        policy_pickled = pickle.loads(p)
        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None,
                                                      np.prod(obs_dim)))
        outputs = policy_pickled.build(state_input, name='policy')
        output2 = sess.run([outputs],
                           feed_dict={state_input: [obs.flatten()]})
        assert np.array_equal(output1, output2)
def test_get_regularizable_vars(self, obs_dim, action_dim):
    """Only non-bias, non-output weights should be regularizable."""
    dummy = DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim)
    env = GymEnv(dummy)
    policy = ContinuousMLPPolicy(env_spec=env.spec)
    regularizable = policy.get_regularizable_vars()
    # One weight matrix per hidden layer (default: two hidden layers).
    assert len(regularizable) == 2
    assert all(('bias' not in v.name) and ('output' not in v.name)
               for v in regularizable)
def run_garage(env, seed, log_dir):
    """Create garage model and training.

    Replace the ddpg with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path of the tabular (CSV) log file written during training.
    """
    deterministic.set_seed(seed)
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))
        # Set up params for ddpg
        action_noise = OUStrategy(env.spec, sigma=params['sigma'])
        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=params['qf_hidden_sizes'],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'])
        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    steps_per_epoch=params['steps_per_epoch'],
                    policy_lr=params['policy_lr'],
                    qf_lr=params['qf_lr'],
                    target_update_tau=params['tau'],
                    n_train_steps=params['n_train_steps'],
                    discount=params['discount'],
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.train.AdamOptimizer,
                    qf_optimizer=tf.train.AdamOptimizer)
        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        tensorboard_log_dir = osp.join(log_dir)
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(tensorboard_log_dir))
        runner.setup(ddpg, env)
        runner.train(n_epochs=params['n_epochs'],
                     batch_size=params['n_rollout_steps'])
        dowel_logger.remove_all()
        return tabular_log_file
def test_ddpg_pendulum(self):
    """Test DDPG with InvertedDoublePendulum environment.

    (Docstring previously said "PPO" — this test trains DDPG.)
    """
    logger._tensorboard = TensorBoardOutput()
    env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
    action_noise = OUStrategy(env.spec, sigma=0.2)
    policy = ContinuousMLPPolicy(env_spec=env.spec,
                                 hidden_sizes=[64, 64],
                                 hidden_nonlinearity=tf.nn.relu,
                                 output_nonlinearity=tf.nn.tanh)
    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=tf.nn.relu)
    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)
    algo = DDPG(
        env,
        policy=policy,
        policy_lr=1e-4,
        qf_lr=1e-3,
        qf=qf,
        replay_buffer=replay_buffer,
        plot=False,
        target_update_tau=1e-2,
        n_epochs=10,
        n_epoch_cycles=20,
        max_path_length=100,
        n_train_steps=50,
        discount=0.9,
        min_buffer_size=int(1e4),
        exploration_strategy=action_noise,
    )
    last_avg_ret = algo.train(sess=self.sess)
    # Sanity threshold: training should make some progress.
    assert last_avg_ret > 30
def test_is_pickleable(self, obs_dim, action_dim):
    """Test if ContinuousMLPPolicy is pickleable.

    The internal MLPModel is replaced with SimpleMLPModel so the policy's
    output is a known deterministic function of its single variable.
    """
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.policies.'
                     'continuous_mlp_policy.MLPModel'),
                    new=SimpleMLPModel):
        policy = ContinuousMLPPolicy(env_spec=env.spec)
        env.reset()
        obs, _, _, _ = env.step(1)
        with tf.compat.v1.variable_scope('ContinuousMLPPolicy/MLPModel',
                                         reuse=True):
            return_var = tf.compat.v1.get_variable('return_var')
            # assign it to all one so the output differs from the default
            return_var.load(tf.ones_like(return_var).eval())
        output1 = self.sess.run(
            policy.model.outputs,
            feed_dict={policy.model.input: [obs.flatten()]})
        p = pickle.dumps(policy)
        # Unpickle into a fresh graph to prove weights round-trip.
        with tf.compat.v1.Session(graph=tf.Graph()) as sess:
            policy_pickled = pickle.loads(p)
            output2 = sess.run(
                policy_pickled.model.outputs,
                feed_dict={policy_pickled.model.input: [obs.flatten()]})
            assert np.array_equal(output1, output2)
def test_ddpg_double_pendulum(self):
    """Test DDPG with InvertedDoublePendulum environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = GarageEnv(gym.make('InvertedDoublePendulum-v2'))
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        # OU noise wrapped around the deterministic policy for exploration.
        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e5))
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            max_path_length=100,
            steps_per_epoch=20,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(5e3),
            exploration_policy=exploration_policy,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=100)
        # Sanity threshold: training should make some progress.
        assert last_avg_ret > 60
        env.close()
def her_ddpg_fetchreach(ctxt=None, seed=1):
    """Train DDPG + HER on the goal-conditioned FetchReach env.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = TfEnv(gym.make('FetchReach-v1'))
        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)
        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
        )
        # Hindsight replay: relabels goals using the env's reward function.
        replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                        size_in_transitions=int(1e6),
                                        time_horizon=100,
                                        replay_k=0.4,
                                        reward_fun=env.compute_reward)
        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            steps_per_epoch=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_policy=exploration_policy,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            buffer_batch_size=256,
        )
        runner.setup(algo=ddpg, env=env)
        runner.train(n_epochs=50, batch_size=100)
def test_ddpg_double_pendulum(self):
    """Test DDPG with InvertedDoublePendulum environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
        # OU noise strategy supplies exploration for the deterministic policy.
        action_noise = OUStrategy(env.spec, sigma=0.2)
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e5),
                                           time_horizon=100)
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            steps_per_epoch=20,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(5e3),
            exploration_strategy=action_noise,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=100)
        # Sanity threshold: training should make some progress.
        assert last_avg_ret > 60
        env.close()
def her_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow HER model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.
    """
    deterministic.set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(normalize(gym.make(env_id)))
        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        exploration_policy = AddOrnsteinUhlenbeckNoise(
            env_spec=env.spec,
            policy=policy,
            sigma=hyper_parameters['sigma'])
        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
        )
        # Hindsight replay: relabels goals via the env's reward function.
        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=hyper_parameters['replay_buffer_size'],
            time_horizon=hyper_parameters['n_rollout_steps'],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            steps_per_epoch=hyper_parameters['steps_per_epoch'],
            policy_lr=hyper_parameters['policy_lr'],
            qf_lr=hyper_parameters['qf_lr'],
            target_update_tau=hyper_parameters['tau'],
            n_train_steps=hyper_parameters['n_train_steps'],
            discount=hyper_parameters['discount'],
            exploration_policy=exploration_policy,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            buffer_batch_size=256,
        )
        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['n_rollout_steps'])
def td3_pendulum(ctxt=None, seed=1):
    """Wrap TD3 training task in the run_task function.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        exploration_policy = AddGaussianNoise(env.spec,
                                              policy,
                                              max_sigma=0.1,
                                              min_sigma=0.1)
        # TD3 uses twin Q-functions to reduce overestimation bias.
        qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                    env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    action_merge_layer=0,
                                    hidden_nonlinearity=tf.nn.relu)
        qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                     env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     action_merge_layer=0,
                                     hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=250)
        td3 = TD3(env_spec=env.spec,
                  policy=policy,
                  policy_lr=1e-4,
                  qf_lr=1e-3,
                  qf=qf,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  target_update_tau=1e-2,
                  steps_per_epoch=20,
                  n_train_steps=1,
                  smooth_return=False,
                  discount=0.99,
                  buffer_batch_size=100,
                  min_buffer_size=1e4,
                  exploration_policy=exploration_policy,
                  policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                  qf_optimizer=tf.compat.v1.train.AdamOptimizer)
        runner.setup(td3, env)
        runner.train(n_epochs=500, batch_size=250)
def her_ddpg_fetchreach(ctxt=None, seed=1):
    """Train DDPG + HER on the goal-conditioned FetchReach env.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = GymEnv('FetchReach-v1')
        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)
        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
        )
        # pylint: disable=no-member
        # Hindsight replay: relabels goals via the env's reward function.
        replay_buffer = HERReplayBuffer(capacity_in_transitions=int(1e6),
                                        replay_k=4,
                                        reward_fn=env.compute_reward,
                                        env_spec=env.spec)
        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.01,
            steps_per_epoch=50,
            n_train_steps=40,
            discount=0.95,
            exploration_policy=exploration_policy,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            buffer_batch_size=256,
        )
        trainer.setup(algo=ddpg, env=env)
        trainer.train(n_epochs=50, batch_size=256)
def test_td3_pendulum(self):
    """Test TD3 with InvertedDoublePendulum environment."""
    with TFTrainer(snapshot_config) as trainer:
        n_epochs = 10
        steps_per_epoch = 20
        sampler_batch_size = 250
        # Total env steps: used to schedule the exploration-noise decay.
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        exploration_policy = AddGaussianNoise(
            env.spec,
            policy,
            total_timesteps=num_timesteps,
            max_sigma=0.1,
            min_sigma=0.1)
        # TD3 uses twin Q-functions to reduce overestimation bias.
        qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                    env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    action_merge_layer=0,
                                    hidden_nonlinearity=tf.nn.relu)
        qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                     env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     action_merge_layer=0,
                                     hidden_nonlinearity=tf.nn.relu)
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
        algo = TD3(env_spec=env.spec,
                   policy=policy,
                   policy_lr=1e-3,
                   qf_lr=1e-3,
                   qf=qf,
                   qf2=qf2,
                   replay_buffer=replay_buffer,
                   steps_per_epoch=steps_per_epoch,
                   target_update_tau=0.005,
                   n_train_steps=50,
                   discount=0.99,
                   min_buffer_size=int(1e4),
                   buffer_batch_size=100,
                   policy_weight_decay=0.001,
                   qf_weight_decay=0.001,
                   exploration_policy=exploration_policy,
                   policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                   qf_optimizer=tf.compat.v1.train.AdamOptimizer)
        trainer.setup(algo, env, sampler_cls=LocalSampler)
        last_avg_ret = trainer.train(n_epochs=n_epochs,
                                     batch_size=sampler_batch_size)
        # Sanity threshold: training should make some progress.
        assert last_avg_ret > 200
def continuous_mlp_q_function(ctxt, env_id, seed):
    """Create Continuous MLP QFunction on TF-DDPG.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.
    """
    deterministic.set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))
        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='ContinuousMLPPolicy',
            hidden_sizes=hyper_params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)
        exploration_policy = AddOrnsteinUhlenbeckNoise(
            env.spec, policy, sigma=hyper_params['sigma'])
        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hyper_params['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            name='ContinuousMLPQFunction')
        replay_buffer = PathBuffer(
            capacity_in_transitions=hyper_params['replay_buffer_size'])
        # Sampler collects rollouts with the noisy exploration policy.
        sampler = LocalSampler(agents=exploration_policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               is_tf_worker=True,
                               worker_class=FragmentWorker)
        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    sampler=sampler,
                    steps_per_epoch=hyper_params['steps_per_epoch'],
                    policy_lr=hyper_params['policy_lr'],
                    qf_lr=hyper_params['qf_lr'],
                    target_update_tau=hyper_params['tau'],
                    n_train_steps=hyper_params['n_train_steps'],
                    discount=hyper_params['discount'],
                    min_buffer_size=int(1e4),
                    exploration_policy=exploration_policy,
                    policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                    qf_optimizer=tf.compat.v1.train.AdamOptimizer)
        trainer.setup(ddpg, env)
        trainer.train(n_epochs=hyper_params['n_epochs'],
                      batch_size=hyper_params['n_exploration_steps'])
def run_task(snapshot_config, *_):
    """Run task: DDPG + HER on goal-conditioned FetchReach.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        *_ (object): Ignored by this function.
    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('FetchReach-v1'))
        action_noise = OUStrategy(env.spec, sigma=0.2)
        # input_include_goal=True: networks see observation + desired goal.
        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )
        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )
        replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                        size_in_transitions=int(1e6),
                                        time_horizon=100,
                                        replay_k=0.4,
                                        reward_fun=env.compute_reward)
        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            steps_per_epoch=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )
        runner.setup(algo=ddpg, env=env)
        runner.train(n_epochs=50, batch_size=100)
def run_garage(env, seed, log_dir):
    """
    Create garage model and training.

    Replace the ddpg with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path of the tabular log file written during training.
    """
    ext.set_seed(seed)
    with tf.Graph().as_default():
        # Set up params for ddpg
        action_noise = OUStrategy(env, sigma=params["sigma"])
        actor_net = ContinuousMLPPolicy(
            env_spec=env,
            name="Actor",
            hidden_sizes=params["actor_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)
        critic_net = ContinuousMLPQFunction(
            env_spec=env,
            name="Critic",
            hidden_sizes=params["critic_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu)
        ddpg = DDPG(env,
                    actor=actor_net,
                    critic=critic_net,
                    actor_lr=params["actor_lr"],
                    critic_lr=params["critic_lr"],
                    plot=False,
                    target_update_tau=params["tau"],
                    n_epochs=params["n_epochs"],
                    n_epoch_cycles=params["n_epoch_cycles"],
                    n_rollout_steps=params["n_rollout_steps"],
                    n_train_steps=params["n_train_steps"],
                    discount=params["discount"],
                    replay_buffer_size=params["replay_buffer_size"],
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    actor_optimizer=tf.train.AdamOptimizer,
                    critic_optimizer=tf.train.AdamOptimizer)
        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, "progress.csv")
        tensorboard_log_dir = osp.join(log_dir, "progress")
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(tensorboard_log_dir)
        ddpg.train()
        garage_logger.remove_tabular_output(tabular_log_file)
        return tabular_log_file
def test_get_action(self, obs_dim, action_dim):
    """Actions from get_action/get_actions must lie in the action space."""
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = ContinuousMLPPolicy(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)
    flat_obs = obs.flatten()
    # Single observation path.
    single_action, _ = policy.get_action(flat_obs)
    assert env.action_space.contains(single_action)
    # Batched observation path.
    batch_actions, _ = policy.get_actions([flat_obs, flat_obs, flat_obs])
    assert all(env.action_space.contains(a) for a in batch_actions)
def run_task(snapshot_config, *_):
    """Run task: JoLe-DDPG with learned reward/observation models."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env_name = "HalfCheetah-v3"
        #env_name = "Swimmer-v2"
        #env_name = "InvertedDoublePendulum-v2"
        env = TfEnv(gym.make(env_name))
        action_noise = OUStrategy(env.spec, sigma=0.2)
        #action_noise = GaussianStrategy(env.spec)
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    hidden_nonlinearity=tf.nn.relu)
        # Learned models of the env's reward and next-observation dynamics.
        reward_model = ContinuousMLPRewardFunction(
            env_spec=env.spec,
            hidden_sizes=[400, 300],
            hidden_nonlinearity=tf.nn.relu,
            action_merge_layer=0)
        obs_model = ContinuousMLPObsFunction(env_spec=env.spec,
                                             hidden_sizes=[400, 300],
                                             hidden_nonlinearity=tf.nn.relu,
                                             action_merge_layer=0)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)
        jole_ddpg = JoLeDDPG(env_spec=env.spec,
                             env_name=env_name,
                             policy=policy,
                             policy_lr=1e-4,
                             qf_lr=1e-3,
                             qf=qf,
                             reward_model=reward_model,
                             obs_model=obs_model,
                             replay_buffer=replay_buffer,
                             target_update_tau=1e-2,
                             n_train_steps=50,
                             discount=0.99,
                             min_buffer_size=int(1e4),
                             exploration_strategy=action_noise,
                             policy_optimizer=tf.train.AdamOptimizer,
                             qf_optimizer=tf.train.AdamOptimizer,
                             lambda_ratio=0.01,
                             jole_obs_action_type="random_sample")
        runner.setup(algo=jole_ddpg, env=env)
        runner.train(n_epochs=500, n_epoch_cycles=20, batch_size=100)
def test_build(self, obs_dim, action_dim):
    """Test that an action built symbolically from a new input is valid.

    Args:
        obs_dim: Observation space dimensions (test parametrization).
        action_dim: Action space dimensions (test parametrization).
    """
    env = GymEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = ContinuousMLPPolicy(env_spec=env.spec)
    env.reset()
    obs = env.step(1).observation
    # Use a distinct local name instead of shadowing the ``obs_dim``
    # parameter (the original reassigned it, hiding the test input).
    flat_obs_dim = env.spec.observation_space.flat_dim
    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, flat_obs_dim))
    action_sym = policy.build(state_input, name='action_sym')
    action = self.sess.run(action_sym,
                           feed_dict={state_input: [obs.flatten()]})
    action = policy.action_space.unflatten(action)
    assert env.action_space.contains(action)
def run_task(*_):
    """
    Wrap DDPG training task in the run_task function.

    DDPG + HER on the goal-conditioned FetchReach env (legacy API).

    :param _: Ignored.
    :return:
    """
    env = TfEnv(gym.make('FetchReach-v1'))
    action_noise = OUStrategy(env.spec, sigma=0.2)
    # input_include_goal=True: networks see observation + desired goal.
    policy = ContinuousMLPPolicy(
        env_spec=env.spec,
        name="Policy",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
        input_include_goal=True,
    )
    qf = ContinuousMLPQFunction(
        env_spec=env.spec,
        name="QFunction",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        input_include_goal=True,
    )
    replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                    size_in_transitions=int(1e6),
                                    time_horizon=100,
                                    replay_k=0.4,
                                    reward_fun=env.compute_reward)
    ddpg = DDPG(
        env,
        policy=policy,
        policy_lr=1e-3,
        qf_lr=1e-3,
        qf=qf,
        replay_buffer=replay_buffer,
        plot=False,
        target_update_tau=0.05,
        n_epochs=50,
        n_epoch_cycles=20,
        max_path_length=100,
        n_train_steps=40,
        discount=0.9,
        exploration_strategy=action_noise,
        policy_optimizer=tf.train.AdamOptimizer,
        qf_optimizer=tf.train.AdamOptimizer,
        buffer_batch_size=256,
        input_include_goal=True,
    )
    ddpg.train()
def run_task(snapshot_config, *_):
    """Wrap TD3 training task in the run_task function.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).
    """
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
        action_noise = GaussianStrategy(env.spec,
                                        max_sigma=0.1,
                                        min_sigma=0.1)
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        # TD3 uses twin Q-functions to reduce overestimation bias.
        qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                    env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    action_merge_layer=0,
                                    hidden_nonlinearity=tf.nn.relu)
        qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                     env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     action_merge_layer=0,
                                     hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=250)
        td3 = TD3(env_spec=env.spec,
                  policy=policy,
                  policy_lr=1e-4,
                  qf_lr=1e-3,
                  qf=qf,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  target_update_tau=1e-2,
                  steps_per_epoch=20,
                  n_train_steps=1,
                  smooth_return=False,
                  discount=0.99,
                  buffer_batch_size=100,
                  min_buffer_size=1e4,
                  exploration_strategy=action_noise,
                  policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                  qf_optimizer=tf.compat.v1.train.AdamOptimizer)
        runner.setup(td3, env)
        runner.train(n_epochs=500, batch_size=250)
def test_td3_pendulum(self):
    """Test TD3 with InvertedDoublePendulum environment."""
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
        action_noise = GaussianStrategy(env.spec,
                                        max_sigma=0.1,
                                        min_sigma=0.1)
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        # TD3 uses twin Q-functions to reduce overestimation bias.
        qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                    env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    action_merge_layer=0,
                                    hidden_nonlinearity=tf.nn.relu)
        qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                     env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     action_merge_layer=0,
                                     hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=250)
        algo = TD3(env_spec=env.spec,
                   policy=policy,
                   policy_lr=1e-3,
                   qf_lr=1e-3,
                   qf=qf,
                   qf2=qf2,
                   replay_buffer=replay_buffer,
                   target_update_tau=0.005,
                   n_epoch_cycles=20,
                   n_train_steps=50,
                   discount=0.99,
                   smooth_return=False,
                   min_buffer_size=int(1e4),
                   buffer_batch_size=100,
                   policy_weight_decay=0.001,
                   qf_weight_decay=0.001,
                   exploration_strategy=action_noise,
                   policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                   qf_optimizer=tf.compat.v1.train.AdamOptimizer)
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10,
                                    n_epoch_cycles=20,
                                    batch_size=250)
        # Sanity threshold: training should make some progress.
        assert last_avg_ret > 400
def continuous_mlp_q_function(ctxt, env_id, seed):
    """Create Continuous MLP QFunction on TF-DDPG.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.
    """
    deterministic.set_seed(seed)
    with LocalTFRunner(ctxt, max_cpus=12) as runner:
        env = TfEnv(normalize(gym.make(env_id)))
        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='ContinuousMLPPolicy',
            hidden_sizes=hyper_params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)
        exploration_policy = AddOrnsteinUhlenbeckNoise(
            env.spec, policy, sigma=hyper_params['sigma'])
        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hyper_params['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            name='ContinuousMLPQFunction')
        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=hyper_params['replay_buffer_size'],
            time_horizon=hyper_params['n_rollout_steps'])
        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    steps_per_epoch=hyper_params['steps_per_epoch'],
                    policy_lr=hyper_params['policy_lr'],
                    qf_lr=hyper_params['qf_lr'],
                    target_update_tau=hyper_params['tau'],
                    n_train_steps=hyper_params['n_train_steps'],
                    discount=hyper_params['discount'],
                    min_buffer_size=int(1e4),
                    exploration_policy=exploration_policy,
                    policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                    qf_optimizer=tf.compat.v1.train.AdamOptimizer)
        # 12 parallel envs to match max_cpus above.
        runner.setup(ddpg, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=hyper_params['n_epochs'],
                     batch_size=hyper_params['n_rollout_steps'])
def ddpg_pendulum(ctxt=None, seed=1):
    """Train DDPG with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = GymEnv('InvertedDoublePendulum-v2')
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
        # Sampler collects rollouts with the noisy exploration policy.
        sampler = LocalSampler(agents=exploration_policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               is_tf_worker=True,
                               worker_class=FragmentWorker)
        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    policy_lr=1e-4,
                    qf_lr=1e-3,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    sampler=sampler,
                    steps_per_epoch=20,
                    target_update_tau=1e-2,
                    n_train_steps=50,
                    discount=0.9,
                    min_buffer_size=int(1e4),
                    exploration_policy=exploration_policy,
                    policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                    qf_optimizer=tf.compat.v1.train.AdamOptimizer)
        trainer.setup(algo=ddpg, env=env)
        trainer.train(n_epochs=500, batch_size=100)
def test_no_reset(self):
    """Off-policy sampler respects batch_size when no_reset=True."""
    with LocalRunner(self.sess) as runner:
        # This tests if off-policy sampler respect batch_size
        # when no_reset is set to True
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
        action_noise = OUStrategy(env.spec, sigma=0.2)
        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=[64, 64],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)
        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=[64, 64],
            hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=int(1e6),
            time_horizon=100)
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(1e4),
            exploration_strategy=action_noise,
        )
        sampler = OffPolicyVectorizedSampler(algo, env, 1, no_reset=True)
        sampler.start_worker()
        runner.initialize_tf_vars()
        # Two consecutive 5-step collections with no env reset between them;
        # together they should form one continuous path.
        paths1 = sampler.obtain_samples(0, 5)
        paths2 = sampler.obtain_samples(0, 5)
        len1 = sum([len(path['rewards']) for path in paths1])
        len2 = sum([len(path['rewards']) for path in paths2])
        assert len1 == 5 and len2 == 5, 'Sampler should respect batch_size'
        assert (len(paths1[0]['rewards']) + len(paths2[0]['rewards']) ==
                paths2[0]['running_length']), \
            'Running length should be the length of full path'
        assert np.isclose(
            paths1[0]['rewards'].sum() + paths2[0]['rewards'].sum(),
            paths2[0]['undiscounted_return']
        ), 'Undiscounted_return should be the sum of rewards of full path'
def test_get_action(self, obs_dim, action_dim):
    """Actions from the mocked policy equal the patched constant 0.5.

    MLPModel is replaced with SimpleMLPModel so the expected output is a
    known constant, allowing exact equality checks.
    """
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.policies.'
                     'continuous_mlp_policy.MLPModel'),
                    new=SimpleMLPModel):
        policy = ContinuousMLPPolicy(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)
    action, _ = policy.get_action(obs)
    expected_action = np.full(action_dim, 0.5)
    assert env.action_space.contains(action)
    assert np.array_equal(action, expected_action)
    # Batched path must yield the same constant action for each input.
    actions, _ = policy.get_actions([obs, obs, obs])
    for action in actions:
        assert env.action_space.contains(action)
        assert np.array_equal(action, expected_action)
def run_task(snapshot_config, *_):
    """Run task: Dyna-DDPG with learned reward/observation models."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('HalfCheetah-v3'))
        action_noise = OUStrategy(env.spec, sigma=0.2)
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    hidden_nonlinearity=tf.nn.relu)
        # Learned models of the env's reward and next-observation dynamics.
        reward_model = ContinuousMLPRewardFunction(
            env_spec=env.spec,
            hidden_sizes=[400, 300],
            hidden_nonlinearity=tf.nn.relu)
        obs_model = ContinuousMLPObsFunction(env_spec=env.spec,
                                             hidden_sizes=[400, 300],
                                             hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)
        # Separate buffer for imagined (model-generated) transitions.
        ima_replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                               size_in_transitions=int(1e6),
                                               time_horizon=100)
        dyna_ddpg = DynaDDPG(env_spec=env.spec,
                             policy=policy,
                             policy_lr=1e-4,
                             qf_lr=1e-3,
                             qf=qf,
                             reward_model=reward_model,
                             obs_model=obs_model,
                             replay_buffer=replay_buffer,
                             ima_replay_buffer=ima_replay_buffer,
                             target_update_tau=1e-2,
                             n_train_steps=50,
                             discount=0.99,
                             min_buffer_size=int(1e4),
                             exploration_strategy=action_noise,
                             policy_optimizer=tf.train.AdamOptimizer,
                             qf_optimizer=tf.train.AdamOptimizer)
        runner.setup(algo=dyna_ddpg, env=env)
        runner.train(n_epochs=500, n_epoch_cycles=20, batch_size=100)
def run_task(*_):
    """Run task: DDPG + HER on goal-conditioned FetchReach (LocalRunner)."""
    with LocalRunner() as runner:
        env = TfEnv(gym.make('FetchReach-v1'))
        action_noise = OUStrategy(env.spec, sigma=0.2)
        # input_include_goal=True: networks see observation + desired goal.
        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )
        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )
        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=int(1e6),
            time_horizon=100,
            replay_k=0.4,
            reward_fun=env.compute_reward)
        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            n_epoch_cycles=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )
        runner.setup(algo=ddpg, env=env)
        runner.train(n_epochs=50, n_epoch_cycles=20)
def run_task(*_):
    """
    Wrap DDPG training task in the run_task function.

    DDPG + HER on FetchReach using the legacy actor/critic API.

    :param _: Ignored.
    :return:
    """
    env = gym.make('FetchReach-v1')
    action_noise = OUStrategy(env, sigma=0.2)
    # input_include_goal=True: networks see observation + desired goal.
    actor_net = ContinuousMLPPolicy(
        env_spec=env,
        name="Actor",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
        input_include_goal=True,
    )
    critic_net = ContinuousMLPQFunction(
        env_spec=env,
        name="Critic",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        input_include_goal=True,
    )
    ddpg = DDPG(
        env,
        actor=actor_net,
        actor_lr=1e-3,
        critic_lr=1e-3,
        critic=critic_net,
        plot=False,
        target_update_tau=0.05,
        n_epochs=50,
        n_epoch_cycles=20,
        n_rollout_steps=100,
        n_train_steps=40,
        discount=0.9,
        replay_buffer_size=int(1e6),
        min_buffer_size=int(1e4),
        exploration_strategy=action_noise,
        actor_optimizer=tf.train.AdamOptimizer,
        critic_optimizer=tf.train.AdamOptimizer,
        use_her=True,
        batch_size=256,
        clip_obs=200.,
    )
    ddpg.train()