def setup_method(self):
    self.env = DummyDictEnv()
    self.obs = self.env.reset()
    self.replay_buffer = HerReplayBuffer(
        env_spec=self.env.spec,
        size_in_transitions=3,
        time_horizon=1,
        replay_k=0.4,
        reward_fun=self.env.compute_reward)
def setup_method(self):
    self.env = DummyDictEnv()
    obs = self.env.reset()
    self.replay_buffer = HerReplayBuffer(
        env_spec=self.env.spec,
        size_in_transitions=3,
        time_horizon=1,
        replay_k=0.4,
        reward_fun=self.env.compute_reward)
    # Split the dict observation into its goal-conditioned components.
    self.d_g = obs['desired_goal']
    self.a_g = obs['achieved_goal']
    self.obs = obs['observation']
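# The setup methods above rely on a goal-conditioned dict-observation fixture.
# The class below is a hypothetical stand-in sketch for such a fixture
# (garage's actual DummyDictEnv also exposes an `env.spec` and may use
# different spaces, dtypes and reward logic), written against plain gym.spaces
# just to show the keys the tests expect.
import gym
import numpy as np


class MinimalDictEnv(gym.Env):
    """Toy goal-conditioned env with a Dict observation space."""

    def __init__(self):
        box = gym.spaces.Box(-1., 1., shape=(1, ), dtype=np.float32)
        self.observation_space = gym.spaces.Dict({
            'observation': box,
            'achieved_goal': box,
            'desired_goal': box,
        })
        self.action_space = gym.spaces.Box(-1., 1., shape=(1, ),
                                           dtype=np.float32)

    def reset(self):
        return self.observation_space.sample()

    def step(self, action):
        obs = self.observation_space.sample()
        reward = self.compute_reward(obs['achieved_goal'],
                                     obs['desired_goal'], {})
        return obs, reward, False, {}

    def compute_reward(self, achieved_goal, desired_goal, info):
        # Sparse goal-reaching reward: 0 when the goal is reached, -1 otherwise.
        dist = np.linalg.norm(achieved_goal - desired_goal, axis=-1)
        return -(dist > 1e-3).astype(np.float32)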
def her_ddpg_fetchreach(ctxt=None, seed=1):
    """Train DDPG + HER on the goal-conditioned FetchReach env.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = TfEnv(gym.make('FetchReach-v1'))

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )

        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
        )

        replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                        size_in_transitions=int(1e6),
                                        time_horizon=100,
                                        replay_k=0.4,
                                        reward_fun=env.compute_reward)

        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            steps_per_epoch=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_policy=exploration_policy,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            buffer_batch_size=256,
        )

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=50, batch_size=100)
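# A sketch of the imports and launch boilerplate the example above typically
# sits inside. The module paths below are assumptions matching one garage TF
# release; runners and exploration utilities have moved between releases, so
# adjust them to the installed version.
import gym
import tensorflow as tf

from garage import wrap_experiment
from garage.experiment import LocalTFRunner
from garage.experiment.deterministic import set_seed
from garage.np.exploration_policies import AddOrnsteinUhlenbeckNoise
from garage.replay_buffer import HerReplayBuffer
from garage.tf.algos import DDPG
from garage.tf.envs import TfEnv
from garage.tf.policies import ContinuousMLPPolicy
from garage.tf.q_functions import ContinuousMLPQFunction

# With the imports in place, the example is decorated and called directly:
#
#     @wrap_experiment(snapshot_mode='last')
#     def her_ddpg_fetchreach(ctxt=None, seed=1):
#         ...  # body as defined above
#
#     her_ddpg_fetchreach(seed=1)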
def her_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow HER model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(normalize(gym.make(env_id)))

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )

        exploration_policy = AddOrnsteinUhlenbeckNoise(
            env_spec=env.spec, policy=policy, sigma=hyper_parameters['sigma'])

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=hyper_parameters['replay_buffer_size'],
            time_horizon=hyper_parameters['n_rollout_steps'],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            steps_per_epoch=hyper_parameters['steps_per_epoch'],
            policy_lr=hyper_parameters['policy_lr'],
            qf_lr=hyper_parameters['qf_lr'],
            target_update_tau=hyper_parameters['tau'],
            n_train_steps=hyper_parameters['n_train_steps'],
            discount=hyper_parameters['discount'],
            exploration_policy=exploration_policy,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            buffer_batch_size=256,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['n_rollout_steps'])
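# Illustrative contents for the `hyper_parameters` dict the benchmark function
# above reads. The keys are the ones it actually accesses; the values are
# assumptions that simply mirror the hard-coded FetchReach example earlier in
# this section, not official benchmark settings.
hyper_parameters = {
    'policy_hidden_sizes': [256, 256, 256],
    'qf_hidden_sizes': [256, 256, 256],
    'sigma': 0.2,
    'replay_buffer_size': int(1e6),
    'n_rollout_steps': 100,
    'steps_per_epoch': 20,
    'policy_lr': 1e-3,
    'qf_lr': 1e-3,
    'tau': 0.05,
    'n_train_steps': 40,
    'discount': 0.9,
    'n_epochs': 50,
}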
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('FetchReach-v1'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                        size_in_transitions=int(1e6),
                                        time_horizon=100,
                                        replay_k=0.4,
                                        reward_fun=env.compute_reward)

        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            steps_per_epoch=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=50, batch_size=100)
def run_task(*_):
    """Wrap DDPG training task in the run_task function.

    :param _: Unused arguments.
    :return: None
    """
    env = TfEnv(gym.make('FetchReach-v1'))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = ContinuousMLPPolicy(
        env_spec=env.spec,
        name="Policy",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
        input_include_goal=True,
    )

    qf = ContinuousMLPQFunction(
        env_spec=env.spec,
        name="QFunction",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        input_include_goal=True,
    )

    replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                    size_in_transitions=int(1e6),
                                    time_horizon=100,
                                    replay_k=0.4,
                                    reward_fun=env.compute_reward)

    ddpg = DDPG(
        env,
        policy=policy,
        policy_lr=1e-3,
        qf_lr=1e-3,
        qf=qf,
        replay_buffer=replay_buffer,
        plot=False,
        target_update_tau=0.05,
        n_epochs=50,
        n_epoch_cycles=20,
        max_path_length=100,
        n_train_steps=40,
        discount=0.9,
        exploration_strategy=action_noise,
        policy_optimizer=tf.train.AdamOptimizer,
        qf_optimizer=tf.train.AdamOptimizer,
        buffer_batch_size=256,
        input_include_goal=True,
    )

    ddpg.train()
def run_task(*_):
    """Train DDPG + HER on FetchReach-v1 with a LocalRunner."""
    with LocalRunner() as runner:
        env = TfEnv(gym.make('FetchReach-v1'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=int(1e6),
            time_horizon=100,
            replay_k=0.4,
            reward_fun=env.compute_reward)

        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            n_epoch_cycles=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=50, n_epoch_cycles=20)
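# In the older garage API used by the run_task variants above, the entry
# point is typically handed to run_experiment. The exact import path and
# accepted keyword arguments have moved between releases, so treat the call
# below as a sketch rather than a pinned API.
run_experiment(
    run_task,
    snapshot_mode='last',
    seed=1,
)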
def run_garage(env, seed, log_dir):
    """Create garage model and training.

    Replace the DDPG here with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: The tabular log file path.
    """
    deterministic.set_seed(seed)
    env.reset()

    with LocalRunner() as runner:
        env = TfEnv(normalize(env))

        action_noise = OUStrategy(env.spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=params['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            policy_lr=params['policy_lr'],
            qf_lr=params['qf_lr'],
            target_update_tau=params['tau'],
            n_train_steps=params['n_train_steps'],
            discount=params['discount'],
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        logger.add_output(dowel.StdOutput())
        logger.add_output(dowel.CsvOutput(tabular_log_file))
        logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=params['n_epochs'],
                     n_epoch_cycles=params['n_epoch_cycles'],
                     batch_size=params['n_rollout_steps'])

        logger.remove_all()

        return tabular_log_file
def run_garage(env, seed, log_dir):
    """Create garage model and training.

    Replace the DDPG here with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: The tabular log file path.
    """
    deterministic.set_seed(seed)
    env.reset()

    with LocalRunner() as runner:
        env = TfEnv(env)

        action_noise = OUStrategy(env.spec, sigma=params["sigma"])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=params["policy_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=params["qf_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params["replay_buffer_size"],
            time_horizon=params["n_rollout_steps"],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        algo = DDPG(
            env,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            policy_lr=params["policy_lr"],
            qf_lr=params["qf_lr"],
            plot=False,
            target_update_tau=params["tau"],
            n_epochs=params["n_epochs"],
            n_epoch_cycles=params["n_epoch_cycles"],
            n_train_steps=params["n_train_steps"],
            discount=params["discount"],
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, "progress.csv")
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(log_dir)

        runner.setup(algo, env)

        runner.train(n_epochs=params["n_epochs"],
                     n_epoch_cycles=params["n_epoch_cycles"],
                     batch_size=params["n_rollout_steps"])

        garage_logger.remove_tabular_output(tabular_log_file)

        return tabular_log_file
class TestHerReplayBuffer:

    def setup_method(self):
        self.env = DummyDictEnv()
        self.obs = self.env.reset()
        self.replay_buffer = HerReplayBuffer(
            env_spec=self.env.spec,
            size_in_transitions=3,
            time_horizon=1,
            replay_k=0.4,
            reward_fun=self.env.compute_reward)

    def _add_single_transition(self):
        self.replay_buffer.add_transition(
            observation=self.obs,
            action=self.env.action_space.sample(),
            terminal=False,
            next_observation=self.obs)

    def _add_transitions(self):
        self.replay_buffer.add_transitions(
            observation=[self.obs],
            action=[self.env.action_space.sample()],
            terminal=[False],
            next_observation=[self.obs])

    def test_add_transition_dtype(self):
        self._add_single_transition()
        sample = self.replay_buffer.sample(1)
        assert sample['observation'].dtype == self.env.observation_space[
            'observation'].dtype
        assert sample['achieved_goal'].dtype == self.env.observation_space[
            'achieved_goal'].dtype
        assert sample['goal'].dtype == self.env.observation_space[
            'desired_goal'].dtype
        assert sample['action'].dtype == self.env.action_space.dtype

    def test_add_transitions_dtype(self):
        self._add_transitions()
        sample = self.replay_buffer.sample(1)
        assert sample['observation'].dtype == self.env.observation_space[
            'observation'].dtype
        assert sample['achieved_goal'].dtype == self.env.observation_space[
            'achieved_goal'].dtype
        assert sample['goal'].dtype == self.env.observation_space[
            'desired_goal'].dtype
        assert sample['action'].dtype == self.env.action_space.dtype

    def test_eviction_policy(self):
        self.replay_buffer.add_transitions(
            observation=[self.obs, self.obs],
            next_observation=[self.obs, self.obs],
            terminal=[False, False],
            action=[1, 2])
        assert not self.replay_buffer.full

        self.replay_buffer.add_transitions(
            observation=[self.obs, self.obs],
            next_observation=[self.obs, self.obs],
            terminal=[False, False],
            action=[3, 4])
        assert self.replay_buffer.full

        self.replay_buffer.add_transitions(
            observation=[self.obs, self.obs],
            next_observation=[self.obs, self.obs],
            terminal=[False, False],
            action=[5, 6])
        self.replay_buffer.add_transitions(
            observation=[self.obs, self.obs],
            next_observation=[self.obs, self.obs],
            terminal=[False, False],
            action=[7, 8])

        assert np.array_equal(self.replay_buffer._buffer['action'],
                              [[7], [8], [6]])
        assert self.replay_buffer.n_transitions_stored == 3

    def test_pickleable(self):
        self._add_transitions()
        replay_buffer_pickled = pickle.loads(pickle.dumps(self.replay_buffer))
        assert (replay_buffer_pickled._buffer.keys() ==
                self.replay_buffer._buffer.keys())
        for k in replay_buffer_pickled._buffer:
            assert (replay_buffer_pickled._buffer[k].shape ==
                    self.replay_buffer._buffer[k].shape)
        sample = self.replay_buffer.sample(1)
        sample2 = replay_buffer_pickled.sample(1)
        for k in self.replay_buffer._buffer:
            assert sample[k].shape == sample2[k].shape
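# Conceptual sketch of the 'future' goal-relabeling strategy a HER buffer
# applies at sampling time. replay_k controls the ratio of relabeled to
# original goals: a fraction future_p = 1 - 1 / (1 + replay_k) of sampled
# transitions get their goal replaced by an achieved goal from a later step of
# the same episode, and the reward is recomputed with reward_fun. This follows
# the HER paper / baselines-style formulation; it is not garage's exact
# implementation, and relabel_future_goals is a hypothetical helper name.
import numpy as np


def relabel_future_goals(achieved_goals, goals, t_samples, episode_len,
                         reward_fun, replay_k=0.4, rng=np.random):
    """Relabel a batch of sampled transitions with future achieved goals.

    achieved_goals: (episode_len, goal_dim) achieved goals of one episode.
    goals: (batch, goal_dim) goals of the sampled transitions.
    t_samples: (batch,) integer time indices of the sampled transitions.
    """
    future_p = 1 - 1. / (1 + replay_k)
    batch = len(t_samples)
    # Choose which sampled transitions get a relabeled goal.
    her_mask = rng.uniform(size=batch) < future_p
    # For each relabeled transition, pick a random timestep from its future.
    future_offset = (rng.uniform(size=batch) *
                     (episode_len - t_samples)).astype(int)
    future_t = t_samples + future_offset
    new_goals = goals.copy()
    new_goals[her_mask] = achieved_goals[future_t[her_mask]]
    # Recompute rewards against the (possibly relabeled) goals. Conventionally
    # the achieved goal of the next observation is used; the sampled timestep
    # is used here to keep the sketch self-contained.
    rewards = np.array([
        reward_fun(achieved_goals[t], g, {})
        for t, g in zip(t_samples, new_goals)
    ])
    return new_goals, rewards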
def _initialize(self):
    with tf.name_scope(self.name, "DDPG"):
        with tf.name_scope("setup_networks"):
            # Set up the actor, critic and target networks.
            self.actor._build_net(trainable=True)
            self.critic._build_net(trainable=True)

            # Create target actor and critic network
            target_actor = copy(self.actor)
            target_critic = copy(self.critic)

            # Set up the target network
            target_actor.name = "TargetActor"
            target_actor._build_net(trainable=False)
            target_critic.name = "TargetCritic"
            target_critic._build_net(trainable=False)

        input_shapes = dims_to_shapes(self.input_dims)

        # Initialize replay buffer
        if self.use_her:
            buffer_shapes = {
                key: (self.n_rollout_steps + 1 if key == "observation"
                      or key == "achieved_goal" else self.n_rollout_steps,
                      *input_shapes[key])
                for key, val in input_shapes.items()
            }
            replay_buffer = HerReplayBuffer(
                buffer_shapes=buffer_shapes,
                size_in_transitions=self.replay_buffer_size,
                time_horizon=self.n_rollout_steps,
                sample_transitions=make_her_sample(
                    self.replay_k, self.env.compute_reward))
        else:
            replay_buffer = ReplayBuffer(
                buffer_shapes=input_shapes,
                max_buffer_size=self.replay_buffer_size)

        # Set up target init and update function
        with tf.name_scope("setup_target"):
            actor_init_ops, actor_update_ops = get_target_ops(
                self.actor.global_vars, target_actor.global_vars, self.tau)
            critic_init_ops, critic_update_ops = get_target_ops(
                self.critic.global_vars, target_critic.global_vars, self.tau)
            target_init_op = actor_init_ops + critic_init_ops
            target_update_op = actor_update_ops + critic_update_ops

        f_init_target = tensor_utils.compile_function(
            inputs=[], outputs=target_init_op)
        f_update_target = tensor_utils.compile_function(
            inputs=[], outputs=target_update_op)

        with tf.name_scope("inputs"):
            obs_dim = (self.input_dims["observation"] +
                       self.input_dims["goal"]
                       ) if self.use_her else self.input_dims["observation"]
            y = tf.placeholder(tf.float32, shape=(None, 1), name="input_y")
            obs = tf.placeholder(
                tf.float32, shape=(None, obs_dim), name="input_observation")
            actions = tf.placeholder(
                tf.float32,
                shape=(None, self.input_dims["action"]),
                name="input_action")

        # Set up actor training function
        next_action = self.actor.get_action_sym(obs, name="actor_action")
        next_qval = self.critic.get_qval_sym(
            obs, next_action, name="actor_qval")
        with tf.name_scope("action_loss"):
            action_loss = -tf.reduce_mean(next_qval)
            if self.actor_weight_decay > 0.:
                actor_reg = tc.layers.apply_regularization(
                    tc.layers.l2_regularizer(self.actor_weight_decay),
                    weights_list=self.actor.regularizable_vars)
                action_loss += actor_reg

        with tf.name_scope("minimize_action_loss"):
            actor_train_op = self.actor_optimizer(
                self.actor_lr, name="ActorOptimizer").minimize(
                    action_loss, var_list=self.actor.trainable_vars)

        f_train_actor = tensor_utils.compile_function(
            inputs=[obs], outputs=[actor_train_op, action_loss])

        # Set up critic training function
        qval = self.critic.get_qval_sym(obs, actions, name="q_value")
        with tf.name_scope("qval_loss"):
            qval_loss = tf.reduce_mean(tf.squared_difference(y, qval))
            if self.critic_weight_decay > 0.:
                critic_reg = tc.layers.apply_regularization(
                    tc.layers.l2_regularizer(self.critic_weight_decay),
                    weights_list=self.critic.regularizable_vars)
                qval_loss += critic_reg

        with tf.name_scope("minimize_critic_loss"):
            critic_train_op = self.critic_optimizer(
                self.critic_lr, name="CriticOptimizer").minimize(
                    qval_loss, var_list=self.critic.trainable_vars)

        f_train_critic = tensor_utils.compile_function(
            inputs=[y, obs, actions],
            outputs=[critic_train_op, qval_loss, qval])

        self.f_train_actor = f_train_actor
        self.f_train_critic = f_train_critic
        self.f_init_target = f_init_target
        self.f_update_target = f_update_target
        self.replay_buffer = replay_buffer
        self.target_critic = target_critic
        self.target_actor = target_actor
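# `dims_to_shapes`, used by `_initialize` above to size the replay buffer
# arrays, converts a mapping of per-key dimensions into per-key shape tuples.
# The sketch below follows the common baselines-style definition; the helper
# actually shipped with this code base may differ in details.
def dims_to_shapes(input_dims):
    """Map e.g. {'observation': 10, 'action': 4} to {'observation': (10,), ...}.

    Zero-dimensional entries become empty (scalar) shapes.
    """
    return {
        key: tuple([val]) if val > 0 else tuple()
        for key, val in input_dims.items()
    }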