def setUp(self, mock_rand): mock_rand.return_value = 0.5 super().setUp() self.obs_dim = (5, ) self.act_dim = (2, ) self.box_env = TfEnv( DummyBoxEnv(obs_dim=self.obs_dim, action_dim=self.act_dim)) self.qf1 = ContinuousMLPQFunction(env_spec=self.box_env, hidden_sizes=(32, 32), name='QF1') self.qf2 = ContinuousMLPQFunction(env_spec=self.box_env, hidden_sizes=(64, 64), name='QF2') self.qf3 = ContinuousMLPQFunctionWithModel(env_spec=self.box_env, hidden_sizes=(32, 32), name='QF3') self.qf4 = ContinuousMLPQFunctionWithModel(env_spec=self.box_env, hidden_sizes=(64, 64), name='QF4') self.sess.run(tf.global_variables_initializer()) for a, b in zip(self.qf3.get_trainable_vars(), self.qf1.get_trainable_vars()): self.sess.run(a.assign(b)) for a, b in zip(self.qf4.get_trainable_vars(), self.qf2.get_trainable_vars()): self.sess.run(a.assign(b)) self.obs = self.box_env.reset() self.act = np.full((2, ), 0.5)
def test_build(self, obs_dim, action_dim): env = GymEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim)) qf = ContinuousMLPQFunction(env_spec=env.spec, action_merge_layer=0, hidden_sizes=(1, ), hidden_nonlinearity=None, hidden_w_init=tf.ones_initializer(), output_w_init=tf.ones_initializer()) obs = np.full(obs_dim, 1).flatten() act = np.full(action_dim, 1).flatten() output1 = qf.get_qval([obs], [act]) input_var1 = tf.compat.v1.placeholder(tf.float32, shape=(None, obs.shape[0])) input_var2 = tf.compat.v1.placeholder(tf.float32, shape=(None, act.shape[0])) q_vals = qf.build(input_var1, input_var2, 'another') output2 = self.sess.run(q_vals, feed_dict={ input_var1: [obs], input_var2: [act] }) expected_output = np.full((1, ), np.prod(obs_dim) + np.prod(action_dim)) assert np.array_equal(output1, output2) assert np.array_equal(output2[0], expected_output)
def td3_pendulum(ctxt=None, seed=1): """Wrap TD3 training task in the run_task function. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) with LocalTFRunner(ctxt) as runner: env = TfEnv(gym.make('InvertedDoublePendulum-v2')) policy = ContinuousMLPPolicy(env_spec=env.spec, hidden_sizes=[400, 300], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) exploration_policy = AddGaussianNoise(env.spec, policy, max_sigma=0.1, min_sigma=0.1) qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction', env_spec=env.spec, hidden_sizes=[400, 300], action_merge_layer=0, hidden_nonlinearity=tf.nn.relu) qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2', env_spec=env.spec, hidden_sizes=[400, 300], action_merge_layer=0, hidden_nonlinearity=tf.nn.relu) replay_buffer = SimpleReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=250) td3 = TD3(env_spec=env.spec, policy=policy, policy_lr=1e-4, qf_lr=1e-3, qf=qf, qf2=qf2, replay_buffer=replay_buffer, target_update_tau=1e-2, steps_per_epoch=20, n_train_steps=1, smooth_return=False, discount=0.99, buffer_batch_size=100, min_buffer_size=1e4, exploration_policy=exploration_policy, policy_optimizer=tf.compat.v1.train.AdamOptimizer, qf_optimizer=tf.compat.v1.train.AdamOptimizer) runner.setup(td3, env) runner.train(n_epochs=500, batch_size=250)
def test_is_pickleable(self, obs_dim, action_dim): env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim)) with mock.patch(('garage.tf.q_functions.' 'continuous_mlp_q_function.MLPMergeModel'), new=SimpleMLPMergeModel): qf = ContinuousMLPQFunction(env_spec=env.spec) env.reset() obs, _, _, _ = env.step(1) obs = obs.flatten() act = np.full(action_dim, 0.5).flatten() with tf.compat.v1.variable_scope( 'ContinuousMLPQFunction/SimpleMLPMergeModel', reuse=True): return_var = tf.compat.v1.get_variable('return_var') # assign it to all one return_var.load(tf.ones_like(return_var).eval()) output1 = qf.get_qval([obs], [act]) h_data = pickle.dumps(qf) with tf.compat.v1.Session(graph=tf.Graph()): qf_pickled = pickle.loads(h_data) output2 = qf_pickled.get_qval([obs], [act]) assert np.array_equal(output1, output2)
def test_get_qval_sym(self, obs_dim, action_dim): env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim)) with mock.patch(('garage.tf.q_functions.' 'continuous_mlp_q_function.MLPMergeModel'), new=SimpleMLPMergeModel): qf = ContinuousMLPQFunction(env_spec=env.spec) env.reset() obs, _, _, _ = env.step(1) obs = obs.flatten() act = np.full(action_dim, 0.5).flatten() output1 = qf.get_qval([obs], [act]) input_var1 = tf.compat.v1.placeholder(tf.float32, shape=(None, obs.shape[0])) input_var2 = tf.compat.v1.placeholder(tf.float32, shape=(None, act.shape[0])) q_vals = qf.get_qval_sym(input_var1, input_var2, 'another') output2 = self.sess.run(q_vals, feed_dict={ input_var1: [obs], input_var2: [act] }) expected_output = np.full((1, ), 0.5) assert np.array_equal(output1, output2) assert np.array_equal(output2[0], expected_output)
def test_td3_pendulum(self): """Test TD3 with Pendulum environment.""" with TFTrainer(snapshot_config) as trainer: n_epochs = 10 steps_per_epoch = 20 sampler_batch_size = 250 num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size env = GymEnv('InvertedDoublePendulum-v2', max_episode_length=100) policy = ContinuousMLPPolicy(env_spec=env.spec, hidden_sizes=[400, 300], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) exploration_policy = AddGaussianNoise( env.spec, policy, total_timesteps=num_timesteps, max_sigma=0.1, min_sigma=0.1) qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction', env_spec=env.spec, hidden_sizes=[400, 300], action_merge_layer=0, hidden_nonlinearity=tf.nn.relu) qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2', env_spec=env.spec, hidden_sizes=[400, 300], action_merge_layer=0, hidden_nonlinearity=tf.nn.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) algo = TD3(env_spec=env.spec, policy=policy, policy_lr=1e-3, qf_lr=1e-3, qf=qf, qf2=qf2, replay_buffer=replay_buffer, steps_per_epoch=steps_per_epoch, target_update_tau=0.005, n_train_steps=50, discount=0.99, min_buffer_size=int(1e4), buffer_batch_size=100, policy_weight_decay=0.001, qf_weight_decay=0.001, exploration_policy=exploration_policy, policy_optimizer=tf.compat.v1.train.AdamOptimizer, qf_optimizer=tf.compat.v1.train.AdamOptimizer) trainer.setup(algo, env, sampler_cls=LocalSampler) last_avg_ret = trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size) assert last_avg_ret > 200
def test_clone(self, obs_dim, action_dim, hidden_sizes): env = GymEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim)) qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=hidden_sizes) qf_clone = qf.clone('another_qf') assert qf_clone._hidden_sizes == qf._hidden_sizes for cloned_param, param in zip(qf_clone.parameters.values(), qf.parameters.values()): assert np.array_equal(cloned_param, param)
def test_clone(self, obs_dim, action_dim, hidden_sizes): env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim)) with mock.patch(('garage.tf.q_functions.' 'continuous_mlp_q_function.MLPMergeModel'), new=SimpleMLPMergeModel): qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=hidden_sizes) qf_clone = qf.clone('another_qf') assert qf_clone._hidden_sizes == qf._hidden_sizes
def test_output_shape(self, obs_dim, action_dim): env = GymEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim)) qf = ContinuousMLPQFunction(env_spec=env.spec) env.reset() obs = env.step(1).observation obs = obs.flatten() act = np.full(action_dim, 0.5).flatten() outputs = qf.get_qval([obs], [act]) assert outputs.shape == (1, 1)
def test_td3_pendulum(self): """Test TD3 with Pendulum environment.""" with LocalTFRunner(snapshot_config) as runner: env = TfEnv(gym.make('InvertedDoublePendulum-v2')) action_noise = GaussianStrategy(env.spec, max_sigma=0.1, min_sigma=0.1) policy = ContinuousMLPPolicy(env_spec=env.spec, hidden_sizes=[400, 300], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction', env_spec=env.spec, hidden_sizes=[400, 300], action_merge_layer=0, hidden_nonlinearity=tf.nn.relu) qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2', env_spec=env.spec, hidden_sizes=[400, 300], action_merge_layer=0, hidden_nonlinearity=tf.nn.relu) replay_buffer = SimpleReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=250) algo = TD3(env_spec=env.spec, policy=policy, policy_lr=1e-3, qf_lr=1e-3, qf=qf, qf2=qf2, replay_buffer=replay_buffer, target_update_tau=0.005, n_epoch_cycles=20, n_train_steps=50, discount=0.99, smooth_return=False, min_buffer_size=int(1e4), buffer_batch_size=100, policy_weight_decay=0.001, qf_weight_decay=0.001, exploration_strategy=action_noise, policy_optimizer=tf.compat.v1.train.AdamOptimizer, qf_optimizer=tf.compat.v1.train.AdamOptimizer) runner.setup(algo, env) last_avg_ret = runner.train(n_epochs=10, n_epoch_cycles=20, batch_size=250) assert last_avg_ret > 400
def run_task(snapshot_config, *_): """Wrap TD3 training task in the run_task function. Args: snapshot_config (garage.experiment.SnapshotConfig): Configuration values for snapshotting. *_ (object): Hyperparameters (unused). """ with LocalTFRunner(snapshot_config) as runner: env = TfEnv(gym.make('InvertedDoublePendulum-v2')) action_noise = GaussianStrategy(env.spec, max_sigma=0.1, min_sigma=0.1) policy = ContinuousMLPPolicy(env_spec=env.spec, hidden_sizes=[400, 300], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction', env_spec=env.spec, hidden_sizes=[400, 300], action_merge_layer=0, hidden_nonlinearity=tf.nn.relu) qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2', env_spec=env.spec, hidden_sizes=[400, 300], action_merge_layer=0, hidden_nonlinearity=tf.nn.relu) replay_buffer = SimpleReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=250) td3 = TD3(env_spec=env.spec, policy=policy, policy_lr=1e-4, qf_lr=1e-3, qf=qf, qf2=qf2, replay_buffer=replay_buffer, target_update_tau=1e-2, steps_per_epoch=20, n_train_steps=1, smooth_return=False, discount=0.99, buffer_batch_size=100, min_buffer_size=1e4, exploration_strategy=action_noise, policy_optimizer=tf.compat.v1.train.AdamOptimizer, qf_optimizer=tf.compat.v1.train.AdamOptimizer) runner.setup(td3, env) runner.train(n_epochs=500, batch_size=250)
def test_output_shape(self, obs_dim, action_dim): env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim)) with mock.patch(('garage.tf.q_functions.' 'continuous_mlp_q_function.MLPMergeModel'), new=SimpleMLPMergeModel): qf = ContinuousMLPQFunction(env_spec=env.spec) env.reset() obs, _, _, _ = env.step(1) obs = obs.flatten() act = np.full(action_dim, 0.5).flatten() outputs = qf.get_qval([obs], [act]) assert outputs.shape == (1, 1)
def test_ddpg_pendulum(self): """Test PPO with Pendulum environment.""" logger._tensorboard = TensorBoardOutput() env = TfEnv(gym.make('InvertedDoublePendulum-v2')) action_noise = OUStrategy(env.spec, sigma=0.2) policy = ContinuousMLPPolicy(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu) replay_buffer = SimpleReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=100) algo = DDPG( env, policy=policy, policy_lr=1e-4, qf_lr=1e-3, qf=qf, replay_buffer=replay_buffer, plot=False, target_update_tau=1e-2, n_epochs=10, n_epoch_cycles=20, max_path_length=100, n_train_steps=50, discount=0.9, min_buffer_size=int(1e4), exploration_strategy=action_noise, ) last_avg_ret = algo.train(sess=self.sess) assert last_avg_ret > 30
def run_garage(env, seed, log_dir): ''' Create garage model and training. Replace the ddpg with the algorithm you want to run. :param env: Environment of the task. :param seed: Random seed for the trial. :param log_dir: Log dir path. :return: ''' deterministic.set_seed(seed) with LocalTFRunner(snapshot_config) as runner: env = TfEnv(normalize(env)) # Set up params for ddpg action_noise = OUStrategy(env.spec, sigma=params['sigma']) policy = ContinuousMLPPolicy( env_spec=env.spec, hidden_sizes=params['policy_hidden_sizes'], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=params['qf_hidden_sizes'], hidden_nonlinearity=tf.nn.relu) replay_buffer = SimpleReplayBuffer( env_spec=env.spec, size_in_transitions=params['replay_buffer_size'], time_horizon=params['n_rollout_steps']) ddpg = DDPG(env_spec=env.spec, policy=policy, qf=qf, replay_buffer=replay_buffer, steps_per_epoch=params['steps_per_epoch'], policy_lr=params['policy_lr'], qf_lr=params['qf_lr'], target_update_tau=params['tau'], n_train_steps=params['n_train_steps'], discount=params['discount'], min_buffer_size=int(1e4), exploration_strategy=action_noise, policy_optimizer=tf.train.AdamOptimizer, qf_optimizer=tf.train.AdamOptimizer) # Set up logger since we are not using run_experiment tabular_log_file = osp.join(log_dir, 'progress.csv') tensorboard_log_dir = osp.join(log_dir) dowel_logger.add_output(dowel.StdOutput()) dowel_logger.add_output(dowel.CsvOutput(tabular_log_file)) dowel_logger.add_output(dowel.TensorBoardOutput(tensorboard_log_dir)) runner.setup(ddpg, env) runner.train(n_epochs=params['n_epochs'], batch_size=params['n_rollout_steps']) dowel_logger.remove_all() return tabular_log_file
def run_task(*_): env = normalize( OneHotMultiTaskEnv( task_env_cls=PR2ArmClockEnv, task_args=TASK_ARGS, task_kwargs=TASK_KWARGS)) policy = DeterministicMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(32, 32)) es = OUStrategy(env_spec=env.spec) qf = ContinuousMLPQFunction(env_spec=env.spec) algo = DDPG( env=env, policy=policy, es=es, qf=qf, batch_size=32, max_path_length=100, epoch_length=4000, min_pool_size=10000, n_epochs=1000000000, discount=0.99, scale_reward=0.01, qf_learning_rate=1e-3, policy_learning_rate=1e-4, # Uncomment both lines (this and the plot parameter below) to enable plotting plot=True, ) algo.train()
def her_garage_tf(ctxt, env_id, seed): """Create garage TensorFlow HER model and training. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) with LocalTFRunner(ctxt) as runner: env = TfEnv(normalize(gym.make(env_id))) policy = ContinuousMLPPolicy( env_spec=env.spec, hidden_sizes=hyper_parameters['policy_hidden_sizes'], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh, ) exploration_policy = AddOrnsteinUhlenbeckNoise( env_spec=env.spec, policy=policy, sigma=hyper_parameters['sigma']) qf = ContinuousMLPQFunction( env_spec=env.spec, hidden_sizes=hyper_parameters['qf_hidden_sizes'], hidden_nonlinearity=tf.nn.relu, ) replay_buffer = HerReplayBuffer( env_spec=env.spec, size_in_transitions=hyper_parameters['replay_buffer_size'], time_horizon=hyper_parameters['n_rollout_steps'], replay_k=0.4, reward_fun=env.compute_reward, ) algo = DDPG( env_spec=env.spec, policy=policy, qf=qf, replay_buffer=replay_buffer, steps_per_epoch=hyper_parameters['steps_per_epoch'], policy_lr=hyper_parameters['policy_lr'], qf_lr=hyper_parameters['qf_lr'], target_update_tau=hyper_parameters['tau'], n_train_steps=hyper_parameters['n_train_steps'], discount=hyper_parameters['discount'], exploration_policy=exploration_policy, policy_optimizer=tf.compat.v1.train.AdamOptimizer, qf_optimizer=tf.compat.v1.train.AdamOptimizer, buffer_batch_size=256, ) runner.setup(algo, env) runner.train(n_epochs=hyper_parameters['n_epochs'], batch_size=hyper_parameters['n_rollout_steps'])
def her_ddpg_fetchreach(ctxt=None, seed=1): """Train DDPG + HER on the goal-conditioned FetchReach env. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) with LocalTFRunner(snapshot_config=ctxt) as runner: env = TfEnv(gym.make('FetchReach-v1')) policy = ContinuousMLPPolicy( env_spec=env.spec, name='Policy', hidden_sizes=[256, 256, 256], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh, ) exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec, policy, sigma=0.2) qf = ContinuousMLPQFunction( env_spec=env.spec, name='QFunction', hidden_sizes=[256, 256, 256], hidden_nonlinearity=tf.nn.relu, ) replay_buffer = HerReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=100, replay_k=0.4, reward_fun=env.compute_reward) ddpg = DDPG( env_spec=env.spec, policy=policy, policy_lr=1e-3, qf_lr=1e-3, qf=qf, replay_buffer=replay_buffer, target_update_tau=0.05, steps_per_epoch=20, max_path_length=100, n_train_steps=40, discount=0.9, exploration_policy=exploration_policy, policy_optimizer=tf.compat.v1.train.AdamOptimizer, qf_optimizer=tf.compat.v1.train.AdamOptimizer, buffer_batch_size=256, ) runner.setup(algo=ddpg, env=env) runner.train(n_epochs=50, batch_size=100)
def test_ddpg_double_pendulum(self): """Test DDPG with Pendulum environment.""" with LocalTFRunner(snapshot_config, sess=self.sess) as runner: env = TfEnv(gym.make('InvertedDoublePendulum-v2')) action_noise = OUStrategy(env.spec, sigma=0.2) policy = ContinuousMLPPolicy(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu) replay_buffer = SimpleReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e5), time_horizon=100) algo = DDPG( env_spec=env.spec, policy=policy, policy_lr=1e-4, qf_lr=1e-3, qf=qf, replay_buffer=replay_buffer, steps_per_epoch=20, target_update_tau=1e-2, n_train_steps=50, discount=0.9, min_buffer_size=int(5e3), exploration_strategy=action_noise, ) runner.setup(algo, env) last_avg_ret = runner.train(n_epochs=10, batch_size=100) assert last_avg_ret > 60 env.close()
def test_ddpg_double_pendulum(self): """Test DDPG with Pendulum environment.""" with LocalTFRunner(snapshot_config, sess=self.sess) as runner: env = GarageEnv(gym.make('InvertedDoublePendulum-v2')) policy = ContinuousMLPPolicy(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec, policy, sigma=0.2) qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e5)) algo = DDPG( env_spec=env.spec, policy=policy, policy_lr=1e-4, qf_lr=1e-3, qf=qf, replay_buffer=replay_buffer, max_path_length=100, steps_per_epoch=20, target_update_tau=1e-2, n_train_steps=50, discount=0.9, min_buffer_size=int(5e3), exploration_policy=exploration_policy, ) runner.setup(algo, env) last_avg_ret = runner.train(n_epochs=10, batch_size=100) assert last_avg_ret > 60 env.close()
def continuous_mlp_q_function(ctxt, env_id, seed): """Create Continuous MLP QFunction on TF-DDPG. Args: ctxt (ExperimentContext): The experiment configuration used by :class:`~Trainer` to create the :class:`~Snapshotter`. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) with TFTrainer(ctxt) as trainer: env = normalize(GymEnv(env_id)) policy = ContinuousMLPPolicy( env_spec=env.spec, name='ContinuousMLPPolicy', hidden_sizes=hyper_params['policy_hidden_sizes'], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) exploration_policy = AddOrnsteinUhlenbeckNoise( env.spec, policy, sigma=hyper_params['sigma']) qf = ContinuousMLPQFunction( env_spec=env.spec, hidden_sizes=hyper_params['qf_hidden_sizes'], hidden_nonlinearity=tf.nn.relu, name='ContinuousMLPQFunction') replay_buffer = PathBuffer( capacity_in_transitions=hyper_params['replay_buffer_size']) sampler = LocalSampler(agents=exploration_policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True, worker_class=FragmentWorker) ddpg = DDPG(env_spec=env.spec, policy=policy, qf=qf, replay_buffer=replay_buffer, sampler=sampler, steps_per_epoch=hyper_params['steps_per_epoch'], policy_lr=hyper_params['policy_lr'], qf_lr=hyper_params['qf_lr'], target_update_tau=hyper_params['tau'], n_train_steps=hyper_params['n_train_steps'], discount=hyper_params['discount'], min_buffer_size=int(1e4), exploration_policy=exploration_policy, policy_optimizer=tf.compat.v1.train.AdamOptimizer, qf_optimizer=tf.compat.v1.train.AdamOptimizer) trainer.setup(ddpg, env) trainer.train(n_epochs=hyper_params['n_epochs'], batch_size=hyper_params['n_exploration_steps'])
def run_garage(env, seed, log_dir): """ Create garage model and training. Replace the ddpg with the algorithm you want to run. :param env: Environment of the task. :param seed: Random seed for the trail. :param log_dir: Log dir path. :return: """ ext.set_seed(seed) with tf.Graph().as_default(): # Set up params for ddpg action_noise = OUStrategy(env, sigma=params["sigma"]) actor_net = ContinuousMLPPolicy( env_spec=env, name="Actor", hidden_sizes=params["actor_hidden_sizes"], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) critic_net = ContinuousMLPQFunction( env_spec=env, name="Critic", hidden_sizes=params["critic_hidden_sizes"], hidden_nonlinearity=tf.nn.relu) ddpg = DDPG(env, actor=actor_net, critic=critic_net, actor_lr=params["actor_lr"], critic_lr=params["critic_lr"], plot=False, target_update_tau=params["tau"], n_epochs=params["n_epochs"], n_epoch_cycles=params["n_epoch_cycles"], n_rollout_steps=params["n_rollout_steps"], n_train_steps=params["n_train_steps"], discount=params["discount"], replay_buffer_size=params["replay_buffer_size"], min_buffer_size=int(1e4), exploration_strategy=action_noise, actor_optimizer=tf.train.AdamOptimizer, critic_optimizer=tf.train.AdamOptimizer) # Set up logger since we are not using run_experiment tabular_log_file = osp.join(log_dir, "progress.csv") tensorboard_log_dir = osp.join(log_dir, "progress") garage_logger.add_tabular_output(tabular_log_file) garage_logger.set_tensorboard_dir(tensorboard_log_dir) ddpg.train() garage_logger.remove_tabular_output(tabular_log_file) return tabular_log_file
def her_ddpg_fetchreach(ctxt=None, seed=1): """Train DDPG + HER on the goal-conditioned FetchReach env. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) with TFTrainer(snapshot_config=ctxt) as trainer: env = GymEnv('FetchReach-v1') policy = ContinuousMLPPolicy( env_spec=env.spec, name='Policy', hidden_sizes=[256, 256, 256], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh, ) exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec, policy, sigma=0.2) qf = ContinuousMLPQFunction( env_spec=env.spec, name='QFunction', hidden_sizes=[256, 256, 256], hidden_nonlinearity=tf.nn.relu, ) # pylint: disable=no-member replay_buffer = HERReplayBuffer(capacity_in_transitions=int(1e6), replay_k=4, reward_fn=env.compute_reward, env_spec=env.spec) ddpg = DDPG( env_spec=env.spec, policy=policy, policy_lr=1e-3, qf_lr=1e-3, qf=qf, replay_buffer=replay_buffer, target_update_tau=0.01, steps_per_epoch=50, n_train_steps=40, discount=0.95, exploration_policy=exploration_policy, policy_optimizer=tf.compat.v1.train.AdamOptimizer, qf_optimizer=tf.compat.v1.train.AdamOptimizer, buffer_batch_size=256, ) trainer.setup(algo=ddpg, env=env) trainer.train(n_epochs=50, batch_size=256)
def run_task(snapshot_config, *_): """Run task. Args: snapshot_config (garage.experiment.SnapshotConfig): The snapshot configuration used by LocalRunner to create the snapshotter. *_ (object): Ignored by this function. """ with LocalTFRunner(snapshot_config=snapshot_config) as runner: env = TfEnv(gym.make('FetchReach-v1')) action_noise = OUStrategy(env.spec, sigma=0.2) policy = ContinuousMLPPolicy( env_spec=env.spec, name='Policy', hidden_sizes=[256, 256, 256], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh, input_include_goal=True, ) qf = ContinuousMLPQFunction( env_spec=env.spec, name='QFunction', hidden_sizes=[256, 256, 256], hidden_nonlinearity=tf.nn.relu, input_include_goal=True, ) replay_buffer = HerReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=100, replay_k=0.4, reward_fun=env.compute_reward) ddpg = DDPG( env_spec=env.spec, policy=policy, policy_lr=1e-3, qf_lr=1e-3, qf=qf, replay_buffer=replay_buffer, target_update_tau=0.05, steps_per_epoch=20, max_path_length=100, n_train_steps=40, discount=0.9, exploration_strategy=action_noise, policy_optimizer=tf.train.AdamOptimizer, qf_optimizer=tf.train.AdamOptimizer, buffer_batch_size=256, input_include_goal=True, ) runner.setup(algo=ddpg, env=env) runner.train(n_epochs=50, batch_size=100)
def run_task(snapshot_config, *_): """Run task.""" with LocalTFRunner(snapshot_config=snapshot_config) as runner: env_name = "HalfCheetah-v3" #env_name = "Swimmer-v2" #env_name = "InvertedDoublePendulum-v2" env = TfEnv(gym.make(env_name)) action_noise = OUStrategy(env.spec, sigma=0.2) #action_noise = GaussianStrategy(env.spec) policy = ContinuousMLPPolicy(env_spec=env.spec, hidden_sizes=[400, 300], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[400, 300], hidden_nonlinearity=tf.nn.relu) reward_model = ContinuousMLPRewardFunction( env_spec=env.spec, hidden_sizes=[400, 300], hidden_nonlinearity=tf.nn.relu, action_merge_layer=0) obs_model = ContinuousMLPObsFunction(env_spec=env.spec, hidden_sizes=[400, 300], hidden_nonlinearity=tf.nn.relu, action_merge_layer=0) replay_buffer = SimpleReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=100) jole_ddpg = JoLeDDPG(env_spec=env.spec, env_name=env_name, policy=policy, policy_lr=1e-4, qf_lr=1e-3, qf=qf, reward_model=reward_model, obs_model=obs_model, replay_buffer=replay_buffer, target_update_tau=1e-2, n_train_steps=50, discount=0.99, min_buffer_size=int(1e4), exploration_strategy=action_noise, policy_optimizer=tf.train.AdamOptimizer, qf_optimizer=tf.train.AdamOptimizer, lambda_ratio=0.01, jole_obs_action_type="random_sample") runner.setup(algo=jole_ddpg, env=env) runner.train(n_epochs=500, n_epoch_cycles=20, batch_size=100)
def test_no_reset(self): with LocalRunner(sess=self.sess) as runner: # This tests if off-policy sampler respect batch_size # when no_reset is set to True env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2'))) action_noise = OUStrategy(env.spec, sigma=0.2) policy = ContinuousMLPPolicyWithModel( env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) qf = ContinuousMLPQFunction( env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu) replay_buffer = SimpleReplayBuffer( env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=100) algo = DDPG( env_spec=env.spec, policy=policy, policy_lr=1e-4, qf_lr=1e-3, qf=qf, replay_buffer=replay_buffer, target_update_tau=1e-2, n_train_steps=50, discount=0.9, min_buffer_size=int(1e4), exploration_strategy=action_noise, ) sampler = OffPolicyVectorizedSampler(algo, env, 1, no_reset=True) sampler.start_worker() runner.initialize_tf_vars() paths1 = sampler.obtain_samples(0, 5) paths2 = sampler.obtain_samples(0, 5) len1 = sum([len(path['rewards']) for path in paths1]) len2 = sum([len(path['rewards']) for path in paths2]) assert len1 == 5 and len2 == 5, 'Sampler should respect batch_size' # yapf: disable assert (len(paths1[0]['rewards']) + len(paths2[0]['rewards']) == paths2[0]['running_length']), ( 'Running length should be the length of full path') # yapf: enable assert np.isclose( paths1[0]['rewards'].sum() + paths2[0]['rewards'].sum(), paths2[0]['undiscounted_return'] ), 'Undiscounted_return should be the sum of rewards of full path'
def run_task(*_): """ Wrap DDPG training task in the run_task function. :param _: :return: """ env = TfEnv(gym.make('FetchReach-v1')) action_noise = OUStrategy(env.spec, sigma=0.2) policy = ContinuousMLPPolicy( env_spec=env.spec, name="Policy", hidden_sizes=[256, 256, 256], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh, input_include_goal=True, ) qf = ContinuousMLPQFunction( env_spec=env.spec, name="QFunction", hidden_sizes=[256, 256, 256], hidden_nonlinearity=tf.nn.relu, input_include_goal=True, ) replay_buffer = HerReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=100, replay_k=0.4, reward_fun=env.compute_reward) ddpg = DDPG( env, policy=policy, policy_lr=1e-3, qf_lr=1e-3, qf=qf, replay_buffer=replay_buffer, plot=False, target_update_tau=0.05, n_epochs=50, n_epoch_cycles=20, max_path_length=100, n_train_steps=40, discount=0.9, exploration_strategy=action_noise, policy_optimizer=tf.train.AdamOptimizer, qf_optimizer=tf.train.AdamOptimizer, buffer_batch_size=256, input_include_goal=True, ) ddpg.train()
def continuous_mlp_q_function(ctxt, env_id, seed): """Create Continuous MLP QFunction on TF-DDPG. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) with LocalTFRunner(ctxt, max_cpus=12) as runner: env = TfEnv(normalize(gym.make(env_id))) policy = ContinuousMLPPolicy( env_spec=env.spec, name='ContinuousMLPPolicy', hidden_sizes=hyper_params['policy_hidden_sizes'], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) exploration_policy = AddOrnsteinUhlenbeckNoise( env.spec, policy, sigma=hyper_params['sigma']) qf = ContinuousMLPQFunction( env_spec=env.spec, hidden_sizes=hyper_params['qf_hidden_sizes'], hidden_nonlinearity=tf.nn.relu, name='ContinuousMLPQFunction') replay_buffer = SimpleReplayBuffer( env_spec=env.spec, size_in_transitions=hyper_params['replay_buffer_size'], time_horizon=hyper_params['n_rollout_steps']) ddpg = DDPG(env_spec=env.spec, policy=policy, qf=qf, replay_buffer=replay_buffer, steps_per_epoch=hyper_params['steps_per_epoch'], policy_lr=hyper_params['policy_lr'], qf_lr=hyper_params['qf_lr'], target_update_tau=hyper_params['tau'], n_train_steps=hyper_params['n_train_steps'], discount=hyper_params['discount'], min_buffer_size=int(1e4), exploration_policy=exploration_policy, policy_optimizer=tf.compat.v1.train.AdamOptimizer, qf_optimizer=tf.compat.v1.train.AdamOptimizer) runner.setup(ddpg, env, sampler_args=dict(n_envs=12)) runner.train(n_epochs=hyper_params['n_epochs'], batch_size=hyper_params['n_rollout_steps'])
def test_q_vals(self, obs_dim, action_dim): env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim)) with mock.patch(('garage.tf.q_functions.' 'continuous_mlp_q_function.MLPMergeModel'), new=SimpleMLPMergeModel): qf = ContinuousMLPQFunction(env_spec=env.spec) env.reset() obs, _, _, _ = env.step(1) obs = obs.flatten() act = np.full(action_dim, 0.5).flatten() expected_output = np.full((1, ), 0.5) outputs = qf.get_qval([obs], [act]) assert np.array_equal(outputs[0], expected_output) outputs = qf.get_qval([obs, obs, obs], [act, act, act]) for output in outputs: assert np.array_equal(output, expected_output)
def ddpg_pendulum(ctxt=None, seed=1): """Train DDPG with InvertedDoublePendulum-v2 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) with TFTrainer(snapshot_config=ctxt) as trainer: env = GymEnv('InvertedDoublePendulum-v2') policy = ContinuousMLPPolicy(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec, policy, sigma=0.2) qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) sampler = LocalSampler(agents=exploration_policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True, worker_class=FragmentWorker) ddpg = DDPG(env_spec=env.spec, policy=policy, policy_lr=1e-4, qf_lr=1e-3, qf=qf, replay_buffer=replay_buffer, sampler=sampler, steps_per_epoch=20, target_update_tau=1e-2, n_train_steps=50, discount=0.9, min_buffer_size=int(1e4), exploration_policy=exploration_policy, policy_optimizer=tf.compat.v1.train.AdamOptimizer, qf_optimizer=tf.compat.v1.train.AdamOptimizer) trainer.setup(algo=ddpg, env=env) trainer.train(n_epochs=500, batch_size=100)
def test_q_vals_goal_conditioned(self): env = GarageEnv(DummyDictEnv()) with mock.patch(('garage.tf.q_functions.' 'continuous_mlp_q_function.MLPMergeModel'), new=SimpleMLPMergeModel): qf = ContinuousMLPQFunction(env_spec=env.spec) env.reset() obs, _, _, _ = env.step(1) obs = np.concatenate( (obs['observation'], obs['desired_goal'], obs['achieved_goal']), axis=-1) act = np.full((1, ), 0.5).flatten() expected_output = np.full((1, ), 0.5) outputs = qf.get_qval([obs], [act]) assert np.array_equal(outputs[0], expected_output) outputs = qf.get_qval([obs, obs, obs], [act, act, act]) for output in outputs: assert np.array_equal(output, expected_output)