class TestReplayBuffer(unittest.TestCase):
    """Tests for the ReplayBuffer object."""

    def setUp(self):
        self.replay_buffer = ReplayBuffer(
            buffer_size=2, batch_size=1, obs_dim=1, ac_dim=1)

    def tearDown(self):
        del self.replay_buffer

    def test_buffer_size(self):
        """Validate the buffer_size output from the replay buffer."""
        self.assertEqual(self.replay_buffer.buffer_size, 2)

    def test_add_sample(self):
        """Test the `add` and `sample` methods of the replay buffer."""
        # Add an element.
        self.replay_buffer.add(
            obs_t=np.array([0]),
            action=np.array([1]),
            reward=2,
            obs_tp1=np.array([3]),
            done=False
        )

        # Check is_full in the False case.
        self.assertEqual(self.replay_buffer.is_full(), False)

        # Add an element.
        self.replay_buffer.add(
            obs_t=np.array([0]),
            action=np.array([1]),
            reward=2,
            obs_tp1=np.array([3]),
            done=False
        )

        # Check is_full in the True case.
        self.assertEqual(self.replay_buffer.is_full(), True)

        # Check can_sample in the True case.
        self.assertEqual(self.replay_buffer.can_sample(), True)

        # Test the `sample` method.
        obs_t, actions_t, rewards, obs_tp1, done = self.replay_buffer.sample()
        np.testing.assert_array_almost_equal(obs_t, [[0]])
        np.testing.assert_array_almost_equal(actions_t, [[1]])
        np.testing.assert_array_almost_equal(rewards, [2])
        np.testing.assert_array_almost_equal(obs_tp1, [[3]])
        np.testing.assert_array_almost_equal(done, [False])
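# NOTE: The test above only exercises the ReplayBuffer's public interface
# (`add`, `sample`, `is_full`, `can_sample`, `buffer_size`). The following is
# a minimal NumPy sketch of a FIFO buffer with that interface, included purely
# to illustrate the assumed semantics; it is not the hbaselines implementation
# and assumes `numpy as np` is already imported, as it is in this test module.
class SimpleReplayBuffer(object):
    """Minimal FIFO replay buffer sketch matching the tested interface."""

    def __init__(self, buffer_size, batch_size, obs_dim, ac_dim):
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self._obs_t = np.zeros((buffer_size, obs_dim))
        self._actions = np.zeros((buffer_size, ac_dim))
        self._rewards = np.zeros(buffer_size)
        self._obs_tp1 = np.zeros((buffer_size, obs_dim))
        self._done = np.zeros(buffer_size)
        self._next_idx = 0
        self._size = 0

    def is_full(self):
        """Return True when the buffer has reached its maximum capacity."""
        return self._size == self.buffer_size

    def can_sample(self):
        """Return True when at least one batch of samples is available."""
        return self._size >= self.batch_size

    def add(self, obs_t, action, reward, obs_tp1, done):
        """Store a transition, overwriting the oldest entry when full."""
        i = self._next_idx
        self._obs_t[i] = obs_t
        self._actions[i] = action
        self._rewards[i] = reward
        self._obs_tp1[i] = obs_tp1
        self._done[i] = float(done)
        self._next_idx = (self._next_idx + 1) % self.buffer_size
        self._size = min(self._size + 1, self.buffer_size)

    def sample(self):
        """Return a random batch of (obs_t, action, reward, obs_tp1, done)."""
        idx = np.random.randint(0, self._size, size=self.batch_size)
        return (self._obs_t[idx], self._actions[idx], self._rewards[idx],
                self._obs_tp1[idx], self._done[idx])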
class FeedForwardPolicy(ActorCriticPolicy): """Feed-forward neural network actor-critic policy. Attributes ---------- sess : tf.compat.v1.Session the current TensorFlow session ob_space : gym.spaces.* the observation space of the environment ac_space : gym.spaces.* the action space of the environment co_space : gym.spaces.* the context space of the environment buffer_size : int the max number of transitions to store batch_size : int SGD batch size actor_lr : float actor learning rate critic_lr : float critic learning rate verbose : int the verbosity level: 0 none, 1 training information, 2 tensorflow debug layers : list of int the size of the Neural network for the policy tau : float target update rate gamma : float discount factor layer_norm : bool enable layer normalisation act_fun : tf.nn.* the activation function to use in the neural network use_huber : bool specifies whether to use the huber distance function as the loss for the critic. If set to False, the mean-squared error metric is used instead noise : float scaling term to the range of the action space, that is subsequently used as the standard deviation of Gaussian noise added to the action if `apply_noise` is set to True in `get_action` target_policy_noise : float standard deviation term to the noise from the output of the target actor policy. See TD3 paper for more. target_noise_clip : float clipping term for the noise injected in the target actor policy zero_fingerprint : bool whether to zero the last two elements of the observations for the actor and critic computations. Used for the worker policy when fingerprints are being implemented. fingerprint_dim : int the number of fingerprint elements in the observation. Used when trying to zero the fingerprint elements. replay_buffer : hbaselines.fcnet.replay_buffer.ReplayBuffer the replay buffer terminals1 : tf.compat.v1.placeholder placeholder for the next step terminals rew_ph : tf.compat.v1.placeholder placeholder for the rewards action_ph : tf.compat.v1.placeholder placeholder for the actions obs_ph : tf.compat.v1.placeholder placeholder for the observations obs1_ph : tf.compat.v1.placeholder placeholder for the next step observations actor_tf : tf.Variable the output from the actor network critic_tf : list of tf.Variable the output from the critic networks. Two networks are used to stabilize training. critic_with_actor_tf : list of tf.Variable the output from the critic networks with the action provided directly by the actor policy target_init_updates : tf.Operation an operation that sets the values of the trainable parameters of the target actor/critic to match those actual actor/critic target_soft_updates : tf.Operation soft target update function actor_loss : tf.Operation the operation that returns the loss of the actor actor_optimizer : tf.Operation the operation that updates the trainable parameters of the actor critic_loss : tf.Operation the operation that returns the loss of the critic critic_optimizer : tf.Operation the operation that updates the trainable parameters of the critic """ def __init__(self, sess, ob_space, ac_space, co_space, buffer_size, batch_size, actor_lr, critic_lr, verbose, tau, gamma, layer_norm, layers, act_fun, use_huber, noise, target_policy_noise, target_noise_clip, scope=None, zero_fingerprint=False, fingerprint_dim=2): """Instantiate the feed-forward neural network policy. 
Parameters ---------- sess : tf.compat.v1.Session the current TensorFlow session ob_space : gym.spaces.* the observation space of the environment ac_space : gym.spaces.* the action space of the environment co_space : gym.spaces.* the context space of the environment buffer_size : int the max number of transitions to store batch_size : int SGD batch size actor_lr : float actor learning rate critic_lr : float critic learning rate verbose : int the verbosity level: 0 none, 1 training information, 2 tensorflow debug tau : float target update rate gamma : float discount factor layer_norm : bool enable layer normalisation layers : list of int or None the size of the Neural network for the policy act_fun : tf.nn.* the activation function to use in the neural network use_huber : bool specifies whether to use the huber distance function as the loss for the critic. If set to False, the mean-squared error metric is used instead noise : float scaling term to the range of the action space, that is subsequently used as the standard deviation of Gaussian noise added to the action if `apply_noise` is set to True in `get_action` target_policy_noise : float standard deviation term to the noise from the output of the target actor policy. See TD3 paper for more. target_noise_clip : float clipping term for the noise injected in the target actor policy scope : str an upper-level scope term. Used by policies that call this one. zero_fingerprint : bool whether to zero the last two elements of the observations for the actor and critic computations. Used for the worker policy when fingerprints are being implemented. fingerprint_dim : int the number of fingerprint elements in the observation. Used when trying to zero the fingerprint elements. Raises ------ AssertionError if the layers is not a list of at least size 1 """ super(FeedForwardPolicy, self).__init__(sess=sess, ob_space=ob_space, ac_space=ac_space, co_space=co_space, buffer_size=buffer_size, batch_size=batch_size, actor_lr=actor_lr, critic_lr=critic_lr, verbose=verbose, tau=tau, gamma=gamma, layer_norm=layer_norm, layers=layers, act_fun=act_fun, use_huber=use_huber) # action magnitudes ac_mag = 0.5 * (ac_space.high - ac_space.low) self.noise = noise * ac_mag self.target_policy_noise = np.array([ac_mag * target_policy_noise]) self.target_noise_clip = np.array([ac_mag * target_noise_clip]) self.zero_fingerprint = zero_fingerprint self.fingerprint_dim = fingerprint_dim assert len(self.layers) >= 1, \ "Error: must have at least one hidden layer for the policy." # Compute the shape of the input observation space, which may include # the contextual term. ob_dim = self._get_ob_dim(ob_space, co_space) # =================================================================== # # Step 1: Create a replay buffer object. # # =================================================================== # self.replay_buffer = ReplayBuffer( buffer_size=self.buffer_size, batch_size=self.batch_size, obs_dim=ob_dim[0], ac_dim=self.ac_space.shape[0], ) # =================================================================== # # Step 2: Create input variables. 
# # =================================================================== # with tf.compat.v1.variable_scope("input", reuse=False): self.terminals1 = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rew_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='rewards') self.action_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ac_space.shape, name='actions') self.obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ob_dim, name='obs0') self.obs1_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ob_dim, name='obs1') # logging of rewards to tensorboard with tf.compat.v1.variable_scope("input_info", reuse=False): tf.compat.v1.summary.scalar('rewards', tf.reduce_mean(self.rew_ph)) # =================================================================== # # Step 3: Create actor and critic variables. # # =================================================================== # # Create networks and core TF parts that are shared across setup parts. with tf.compat.v1.variable_scope("model", reuse=False): self.actor_tf = self.make_actor(self.obs_ph) self.critic_tf = [ self.make_critic(self.obs_ph, self.action_ph, scope="qf_{}".format(i)) for i in range(2) ] self.critic_with_actor_tf = [ self.make_critic(self.obs_ph, self.actor_tf, reuse=True, scope="qf_{}".format(i)) for i in range(2) ] with tf.compat.v1.variable_scope("target", reuse=False): # create the target actor policy actor_target = self.make_actor(self.obs1_ph) # smooth target policy by adding clipped noise to target actions target_noise = tf.random.normal(tf.shape(actor_target), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # clip the noisy action to remain in the bounds noisy_actor_target = tf.clip_by_value(actor_target + target_noise, self.ac_space.low, self.ac_space.high) # create the target critic policies critic_target = [ self.make_critic(self.obs1_ph, noisy_actor_target, scope="qf_{}".format(i)) for i in range(2) ] # Create the target update operations. init, soft = self._setup_target_updates('model', 'target', scope, tau, verbose) self.target_init_updates = init self.target_soft_updates = soft # =================================================================== # # Step 4: Setup the optimizers for the actor and critic. # # =================================================================== # with tf.compat.v1.variable_scope("Optimizer", reuse=False): self._setup_actor_optimizer(scope) self._setup_critic_optimizer(critic_target, scope) tf.compat.v1.summary.scalar('actor_loss', self.actor_loss) tf.compat.v1.summary.scalar('Q1_loss', self.critic_loss[0]) tf.compat.v1.summary.scalar('Q2_loss', self.critic_loss[1]) # =================================================================== # # Step 5: Setup the operations for computing model statistics. # # =================================================================== # # Setup the running means and standard deviations of the model inputs # and outputs. 
self.stats_ops, self.stats_names = self._setup_stats(scope or "Model") def _setup_actor_optimizer(self, scope): """Create the actor loss, gradient, and optimizer.""" if self.verbose >= 2: print('setting up actor optimizer') scope_name = 'model/pi/' if scope is not None: scope_name = scope + '/' + scope_name if self.verbose >= 2: actor_shapes = [ var.get_shape().as_list() for var in get_trainable_vars(scope_name) ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) print(' actor shapes: {}'.format(actor_shapes)) print(' actor params: {}'.format(actor_nb_params)) # compute the actor loss self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf[0]) # create an optimizer object optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr) self.actor_optimizer = optimizer.minimize( self.actor_loss, var_list=get_trainable_vars(scope_name)) def _setup_critic_optimizer(self, critic_target, scope): """Create the critic loss, gradient, and optimizer.""" if self.verbose >= 2: print('setting up critic optimizer') # compute the target critic term with tf.compat.v1.variable_scope("loss", reuse=False): q_obs1 = tf.minimum(critic_target[0], critic_target[1]) target_q = tf.stop_gradient(self.rew_ph + (1. - self.terminals1) * self.gamma * q_obs1) tf.compat.v1.summary.scalar('critic_target', tf.reduce_mean(target_q)) # choose the loss function if self.use_huber: loss_fn = tf.compat.v1.losses.huber_loss else: loss_fn = tf.compat.v1.losses.mean_squared_error self.critic_loss = [loss_fn(q, target_q) for q in self.critic_tf] self.critic_optimizer = [] for i, critic_loss in enumerate(self.critic_loss): scope_name = 'model/qf_{}/'.format(i) if scope is not None: scope_name = scope + '/' + scope_name if self.verbose >= 2: critic_shapes = [ var.get_shape().as_list() for var in get_trainable_vars(scope_name) ] critic_nb_params = sum([ reduce(lambda x, y: x * y, shape) for shape in critic_shapes ]) print(' critic shapes: {}'.format(critic_shapes)) print(' critic params: {}'.format(critic_nb_params)) # create an optimizer object optimizer = tf.compat.v1.train.AdamOptimizer(self.critic_lr) # create the optimizer object self.critic_optimizer.append( optimizer.minimize(loss=critic_loss, var_list=get_trainable_vars(scope_name))) def make_actor(self, obs, reuse=False, scope="pi"): """Create an actor tensor. Parameters ---------- obs : tf.compat.v1.placeholder the input observation placeholder reuse : bool whether or not to reuse parameters scope : str the scope name of the actor Returns ------- tf.Variable the output from the actor """ with tf.compat.v1.variable_scope(scope, reuse=reuse): pi_h = obs # zero out the fingerprint observations for the worker policy if self.zero_fingerprint: pi_h = self._remove_fingerprint(pi_h, self.ob_space.shape[0], self.fingerprint_dim, self.co_space.shape[0]) # create the hidden layers for i, layer_size in enumerate(self.layers): pi_h = self._layer(pi_h, layer_size, 'fc{}'.format(i), act_fun=self.act_fun, layer_norm=self.layer_norm) # create the output layer policy = self._layer( pi_h, self.ac_space.shape[0], 'output', act_fun=tf.nn.tanh, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) # scaling terms to the output from the policy ac_means = (self.ac_space.high + self.ac_space.low) / 2. ac_magnitudes = (self.ac_space.high - self.ac_space.low) / 2. policy = ac_means + ac_magnitudes * tf.to_float(policy) return policy def make_critic(self, obs, action, reuse=False, scope="qf"): """Create a critic tensor. 
Parameters ---------- obs : tf.compat.v1.placeholder the input observation placeholder action : tf.compat.v1.placeholder the input action placeholder reuse : bool whether or not to reuse parameters scope : str the scope name of the actor Returns ------- tf.Variable the output from the critic """ with tf.compat.v1.variable_scope(scope, reuse=reuse): # concatenate the observations and actions qf_h = tf.concat([obs, action], axis=-1) # zero out the fingerprint observations for the worker policy if self.zero_fingerprint: qf_h = self._remove_fingerprint( qf_h, self.ob_space.shape[0], self.fingerprint_dim, self.co_space.shape[0] + self.ac_space.shape[0]) # create the hidden layers for i, layer_size in enumerate(self.layers): qf_h = self._layer(qf_h, layer_size, 'fc{}'.format(i), act_fun=self.act_fun, layer_norm=self.layer_norm) # create the output layer qvalue_fn = self._layer( qf_h, 1, 'qf_output', kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) return qvalue_fn def update(self, update_actor=True, **kwargs): """Perform a gradient update step. **Note**; The target update soft updates occur at the same frequency as the actor update frequencies. Parameters ---------- update_actor : bool specifies whether to update the actor policy. The critic policy is still updated if this value is set to False. Returns ------- [float, float] Q1 loss, Q2 loss float actor loss """ # Not enough samples in the replay buffer. if not self.replay_buffer.can_sample(): return [0, 0], 0 # Get a batch obs0, actions, rewards, obs1, terminals1 = self.replay_buffer.sample() return self.update_from_batch(obs0, actions, rewards, obs1, terminals1, update_actor=update_actor) def update_from_batch(self, obs0, actions, rewards, obs1, terminals1, update_actor=True): """Perform gradient update step given a batch of data. Parameters ---------- obs0 : np.ndarray batch of observations actions : numpy float batch of actions executed given obs_batch rewards : numpy float rewards received as results of executing act_batch obs1 : np.ndarray next set of observations seen after executing act_batch terminals1 : numpy bool done_mask[i] = 1 if executing act_batch[i] resulted in the end of an episode and 0 otherwise. update_actor : bool, optional specified whether to perform gradient update procedures to the actor policy. Default set to True. Note that the update procedure for the critic is always performed when calling this method. Returns ------- [float, float] Q1 loss, Q2 loss float actor loss """ # Reshape to match previous behavior and placeholder shape. rewards = rewards.reshape(-1, 1) terminals1 = terminals1.reshape(-1, 1) # Update operations for the critic networks. step_ops = [ self.critic_loss, self.critic_optimizer[0], self.critic_optimizer[1] ] if update_actor: # Actor updates and target soft update operation. step_ops += [ self.actor_loss, self.actor_optimizer, self.target_soft_updates ] # Perform the update operations and collect the critic loss. critic_loss, *_vals = self.sess.run(step_ops, feed_dict={ self.obs_ph: obs0, self.action_ph: actions, self.rew_ph: rewards, self.obs1_ph: obs1, self.terminals1: terminals1 }) # Extract the actor loss. actor_loss = _vals[2] if update_actor else 0 return critic_loss, actor_loss def get_action(self, obs, context, apply_noise, random_actions): """See parent class.""" # Add the contextual observation, if applicable. 
        obs = self._get_obs(obs, context, axis=1)

        if random_actions:
            action = np.array([self.ac_space.sample()])
        else:
            action = self.sess.run(self.actor_tf, {self.obs_ph: obs})

            if apply_noise:
                # compute noisy action
                action += np.random.normal(0, self.noise, action.shape)

                # clip by bounds
                action = np.clip(
                    action, self.ac_space.low, self.ac_space.high)

        return action

    def value(self, obs, context, action):
        """See parent class."""
        # Add the contextual observation, if applicable.
        obs = self._get_obs(obs, context, axis=1)

        return self.sess.run(
            self.critic_tf,
            feed_dict={self.obs_ph: obs, self.action_ph: action})

    def store_transition(self, obs0, context0, action, reward, obs1, context1,
                         done, is_final_step, evaluate=False):
        """See parent class."""
        if not evaluate:
            # Add the contextual observation, if applicable.
            obs0 = self._get_obs(obs0, context0, axis=0)
            obs1 = self._get_obs(obs1, context1, axis=0)

            # Modify the done mask in accordance with the TD3 algorithm. Done
            # masks that correspond to the final step are set to False.
            done = done and not is_final_step

            self.replay_buffer.add(obs0, action, reward, obs1, float(done))

    def initialize(self):
        """See parent class.

        This method initializes the target parameters to match the model
        parameters.
        """
        self.sess.run(self.target_init_updates)

    def _setup_stats(self, base="Model"):
        """Create the running means and std of the model inputs and outputs.

        This method also adds the same running means and stds as scalars to
        tensorboard for additional storage.
        """
        ops = []
        names = []

        ops += [tf.reduce_mean(self.critic_tf[0])]
        names += ['{}/reference_Q1_mean'.format(base)]
        ops += [reduce_std(self.critic_tf[0])]
        names += ['{}/reference_Q1_std'.format(base)]

        ops += [tf.reduce_mean(self.critic_tf[1])]
        names += ['{}/reference_Q2_mean'.format(base)]
        ops += [reduce_std(self.critic_tf[1])]
        names += ['{}/reference_Q2_std'.format(base)]

        ops += [tf.reduce_mean(self.critic_with_actor_tf[0])]
        names += ['{}/reference_actor_Q1_mean'.format(base)]
        ops += [reduce_std(self.critic_with_actor_tf[0])]
        names += ['{}/reference_actor_Q1_std'.format(base)]

        ops += [tf.reduce_mean(self.critic_with_actor_tf[1])]
        names += ['{}/reference_actor_Q2_mean'.format(base)]
        ops += [reduce_std(self.critic_with_actor_tf[1])]
        names += ['{}/reference_actor_Q2_std'.format(base)]

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['{}/reference_action_mean'.format(base)]
        ops += [reduce_std(self.actor_tf)]
        names += ['{}/reference_action_std'.format(base)]

        # Add all names and ops to the tensorboard summary.
        for op, name in zip(ops, names):
            tf.compat.v1.summary.scalar(name, op)

        return ops, names

    def get_td_map(self):
        """See parent class."""
        # Not enough samples in the replay buffer.
        if not self.replay_buffer.can_sample():
            return {}

        # Get a batch.
        obs0, actions, rewards, obs1, done1 = self.replay_buffer.sample()

        return self.get_td_map_from_batch(obs0, actions, rewards, obs1, done1)

    def get_td_map_from_batch(self, obs0, actions, rewards, obs1, terminals1):
        """Convert a batch to a td_map."""
        # Reshape to match previous behavior and placeholder shape.
        rewards = rewards.reshape(-1, 1)
        terminals1 = terminals1.reshape(-1, 1)

        td_map = {
            self.obs_ph: obs0,
            self.action_ph: actions,
            self.rew_ph: rewards,
            self.obs1_ph: obs1,
            self.terminals1: terminals1
        }

        return td_map
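# NOTE: For reference, the critic target assembled in the TensorFlow graph
# above (target-policy smoothing followed by the clipped double-Q backup) can
# be written out in plain NumPy as below. This is an illustrative sketch that
# assumes `numpy as np` is imported and that `q1_next`/`q2_next` are callables
# standing in for the two target critics; it is not part of the policy class.
def td3_critic_target(rewards, terminals, next_actions, q1_next, q2_next,
                      gamma, target_policy_noise, target_noise_clip,
                      ac_low, ac_high):
    """Compute the TD3 target y = r + gamma * (1 - d) * min(Q1', Q2')."""
    # Smooth the target policy by adding clipped Gaussian noise to the
    # next-step actions, then clip the result to the action bounds.
    noise = np.random.normal(0.0, target_policy_noise, next_actions.shape)
    noise = np.clip(noise, -target_noise_clip, target_noise_clip)
    noisy_actions = np.clip(next_actions + noise, ac_low, ac_high)

    # Clipped double-Q learning: take the minimum of the two target critics.
    q_next = np.minimum(q1_next(noisy_actions), q2_next(noisy_actions))

    # Bootstrapped target, masked by the done flags.
    return rewards + (1.0 - terminals) * gamma * q_next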
class FeedForwardPolicy(ActorCriticPolicy): """SAC-compatible feedforward policy. Attributes ---------- sess : tf.compat.v1.Session the current TensorFlow session ob_space : gym.spaces.* the observation space of the environment ac_space : gym.spaces.* the action space of the environment co_space : gym.spaces.* the context space of the environment buffer_size : int the max number of transitions to store batch_size : int SGD batch size actor_lr : float actor learning rate critic_lr : float critic learning rate verbose : int the verbosity level: 0 none, 1 training information, 2 tensorflow debug tau : float target update rate gamma : float discount factor use_huber : bool specifies whether to use the huber distance function as the loss for the critic. If set to False, the mean-squared error metric is used instead model_params : dict dictionary of model-specific parameters. See parent class. target_entropy : float target entropy used when learning the entropy coefficient replay_buffer : hbaselines.fcnet.replay_buffer.ReplayBuffer the replay buffer terminals1 : tf.compat.v1.placeholder placeholder for the next step terminals rew_ph : tf.compat.v1.placeholder placeholder for the rewards action_ph : tf.compat.v1.placeholder placeholder for the actions obs_ph : tf.compat.v1.placeholder placeholder for the observations obs1_ph : tf.compat.v1.placeholder placeholder for the next step observations deterministic_action : tf.Variable the output from the deterministic actor policy_out : tf.Variable the output from the stochastic actor logp_pi : tf.Variable the log-probability of a given observation given the output action from the policy logp_action : tf.Variable the log-probability of a given observation given a fixed action. Used by the hierarchical policy to perform off-policy corrections. qf1 : tf.Variable the output from the first Q-function qf2 : tf.Variable the output from the second Q-function value_fn : tf.Variable the output from the value function qf1_pi : tf.Variable the output from the first Q-function with the action provided directly by the actor policy qf2_pi : tf.Variable the output from the second Q-function with the action provided directly by the actor policy log_alpha : tf.Variable the log of the entropy coefficient alpha : tf.Variable the entropy coefficient value_target : tf.Variable the output from the target value function. Takes as input the next-step observations target_init_updates : tf.Operation an operation that sets the values of the trainable parameters of the target actor/critic to match those actual actor/critic target_soft_updates : tf.Operation soft target update function alpha_loss : tf.Operation the operation that returns the loss of the entropy term alpha_optimizer : tf.Operation the operation that updates the trainable parameters of the entropy term actor_loss : tf.Operation the operation that returns the loss of the actor actor_optimizer : tf.Operation the operation that updates the trainable parameters of the actor critic_loss : tf.Operation the operation that returns the loss of the critic critic_optimizer : tf.Operation the operation that updates the trainable parameters of the critic """ def __init__(self, sess, ob_space, ac_space, co_space, buffer_size, batch_size, actor_lr, critic_lr, verbose, tau, gamma, use_huber, l2_penalty, model_params, target_entropy, scope=None, num_envs=1): """Instantiate the feed-forward neural network policy. 
Parameters ---------- sess : tf.compat.v1.Session the current TensorFlow session ob_space : gym.spaces.* the observation space of the environment ac_space : gym.spaces.* the action space of the environment co_space : gym.spaces.* the context space of the environment buffer_size : int the max number of transitions to store batch_size : int SGD batch size actor_lr : float actor learning rate critic_lr : float critic learning rate verbose : int the verbosity level: 0 none, 1 training information, 2 tensorflow debug tau : float target update rate gamma : float discount factor use_huber : bool specifies whether to use the huber distance function as the loss for the critic. If set to False, the mean-squared error metric is used instead l2_penalty : float L2 regularization penalty. This is applied to the policy network. model_params : dict dictionary of model-specific parameters. See parent class. target_entropy : float target entropy used when learning the entropy coefficient. If set to None, a heuristic value is used. scope : str an upper-level scope term. Used by policies that call this one. """ super(FeedForwardPolicy, self).__init__( sess=sess, ob_space=ob_space, ac_space=ac_space, co_space=co_space, buffer_size=buffer_size, batch_size=batch_size, actor_lr=actor_lr, critic_lr=critic_lr, verbose=verbose, tau=tau, gamma=gamma, use_huber=use_huber, l2_penalty=l2_penalty, model_params=model_params, num_envs=num_envs, ) if target_entropy is None: self.target_entropy = -np.prod(self.ac_space.shape) else: self.target_entropy = target_entropy self._ac_means = 0.5 * (ac_space.high + ac_space.low) self._ac_magnitudes = 0.5 * (ac_space.high - ac_space.low) # Compute the shape of the input observation space, which may include # the contextual term. ob_dim = self._get_ob_dim(ob_space, co_space) # =================================================================== # # Step 1: Create a replay buffer object. # # =================================================================== # self.replay_buffer = ReplayBuffer( buffer_size=self.buffer_size, batch_size=self.batch_size, obs_dim=ob_dim[0], ac_dim=self.ac_space.shape[0], ) # =================================================================== # # Step 2: Create input variables. # # =================================================================== # with tf.compat.v1.variable_scope("input", reuse=False): self.terminals1 = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rew_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='rewards') self.action_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ac_space.shape, name='actions') self.obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ob_dim, name='obs0') self.obs1_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ob_dim, name='obs1') # =================================================================== # # Step 3: Create actor and critic variables. # # =================================================================== # # Create networks and core TF parts that are shared across setup parts. 
with tf.compat.v1.variable_scope("model", reuse=False): self.deterministic_action, self.policy_out, self.logp_pi, \ self.logp_action = self.make_actor(self.obs_ph, self.action_ph) self.qf1, self.qf2, self.value_fn = self.make_critic( self.obs_ph, self.action_ph, create_qf=True, create_vf=True) self.qf1_pi, self.qf2_pi, _ = self.make_critic(self.obs_ph, self.policy_out, create_qf=True, create_vf=False, reuse=True) # The entropy coefficient or entropy can be learned automatically, # see Automating Entropy Adjustment for Maximum Entropy RL section # of https://arxiv.org/abs/1812.05905 self.log_alpha = tf.compat.v1.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) self.alpha = tf.exp(self.log_alpha) with tf.compat.v1.variable_scope("target", reuse=False): # Create the value network _, _, value_target = self.make_critic(self.obs1_ph, create_qf=False, create_vf=True) self.value_target = value_target # Create the target update operations. init, soft = self._setup_target_updates('model/value_fns/vf', 'target/value_fns/vf', scope, tau, verbose) self.target_init_updates = init self.target_soft_updates = soft # =================================================================== # # Step 4: Setup the optimizers for the actor and critic. # # =================================================================== # with tf.compat.v1.variable_scope("Optimizer", reuse=False): self._setup_actor_optimizer(scope) self._setup_critic_optimizer(scope) # =================================================================== # # Step 5: Setup the operations for computing model statistics. # # =================================================================== # # Setup the running means and standard deviations of the model inputs # and outputs. self.stats_ops, self.stats_names = self._setup_stats(scope or "Model") def make_actor(self, obs, action, reuse=False, scope="pi"): """Create the actor variables. Parameters ---------- obs : tf.compat.v1.placeholder the input observation placeholder action : tf.compat.v1.placeholder the input action placeholder reuse : bool whether or not to reuse parameters scope : str the scope name of the actor Returns ------- tf.Variable the output from the deterministic actor tf.Variable the output from the stochastic actor tf.Variable the log-probability of a given observation given the output action from the policy tf.Variable the log-probability of a given observation given a fixed action """ # Initial image pre-processing (for convolutional policies). if self.model_params["model_type"] == "conv": pi_h = create_conv( obs=obs, image_height=self.model_params["image_height"], image_width=self.model_params["image_width"], image_channels=self.model_params["image_channels"], ignore_flat_channels=self.model_params["ignore_flat_channels"], ignore_image=self.model_params["ignore_image"], filters=self.model_params["filters"], kernel_sizes=self.model_params["kernel_sizes"], strides=self.model_params["strides"], act_fun=self.model_params["act_fun"], layer_norm=self.model_params["layer_norm"], scope=scope, reuse=reuse, ) else: pi_h = obs # Create the model. 
policy_mean, log_std = create_fcnet( obs=pi_h, layers=self.model_params["layers"], num_output=self.ac_space.shape[0], stochastic=True, act_fun=self.model_params["act_fun"], layer_norm=self.model_params["layer_norm"], scope=scope, reuse=reuse, ) # OpenAI Variation to cap the standard deviation log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX) std = tf.exp(log_std) # Reparameterization trick policy = policy_mean + tf.random.normal(tf.shape(policy_mean)) * std logp_pi = gaussian_likelihood(policy, policy_mean, log_std) logp_ac = gaussian_likelihood(action, policy_mean, log_std) # Apply squashing and account for it in the probability _, _, logp_ac = apply_squashing_func(policy_mean, action, logp_ac) deterministic_policy, policy, logp_pi = apply_squashing_func( policy_mean, policy, logp_pi) return deterministic_policy, policy, logp_pi, logp_ac def make_critic(self, obs, action=None, reuse=False, scope="value_fns", create_qf=True, create_vf=True): """Create the critic variables. Parameters ---------- obs : tf.compat.v1.placeholder the input observation placeholder action : tf.compat.v1.placeholder the input action placeholder reuse : bool whether or not to reuse parameters scope : str the scope name of the actor create_qf : bool whether to create the Q-functions create_vf : bool whether to create the value function Returns ------- tf.Variable the output from the first Q-function. Set to None if `create_qf` is False. tf.Variable the output from the second Q-function. Set to None if `create_qf` is False. tf.Variable the output from the value function. Set to None if `create_vf` is False. """ conv_params = dict( image_height=self.model_params["image_height"], image_width=self.model_params["image_width"], image_channels=self.model_params["image_channels"], ignore_flat_channels=self.model_params["ignore_flat_channels"], ignore_image=self.model_params["ignore_image"], filters=self.model_params["filters"], kernel_sizes=self.model_params["kernel_sizes"], strides=self.model_params["strides"], act_fun=self.model_params["act_fun"], layer_norm=self.model_params["layer_norm"], reuse=reuse, ) fcnet_params = dict( layers=self.model_params["layers"], num_output=1, stochastic=False, act_fun=self.model_params["act_fun"], layer_norm=self.model_params["layer_norm"], reuse=reuse, ) with tf.compat.v1.variable_scope(scope, reuse=reuse): # Value function if create_vf: if self.model_params["model_type"] == "conv": vf_h = create_conv(obs=obs, scope="vf", **conv_params) else: vf_h = obs value_fn = create_fcnet(obs=vf_h, scope="vf", output_pre="vf_", **fcnet_params) else: value_fn = None # Double Q values to reduce overestimation if create_qf: # Concatenate the observations and actions. qf_h = tf.concat([obs, action], axis=-1) if self.model_params["model_type"] == "conv": qf1_h = create_conv(obs=qf_h, scope="qf1", **conv_params) qf2_h = create_conv(obs=qf_h, scope="qf2", **conv_params) else: qf1_h = qf_h qf2_h = qf_h qf1 = create_fcnet(obs=qf1_h, scope="qf1", output_pre="qf_", **fcnet_params) qf2 = create_fcnet(obs=qf2_h, scope="qf2", output_pre="qf_", **fcnet_params) else: qf1, qf2 = None, None return qf1, qf2, value_fn def update(self, **kwargs): """Perform a gradient update step.""" # Not enough samples in the replay buffer. 
if not self.replay_buffer.can_sample(): return # Get a batch obs0, actions, rewards, obs1, done1 = self.replay_buffer.sample() return self.update_from_batch(obs0, actions, rewards, obs1, done1) def update_from_batch(self, obs0, actions, rewards, obs1, terminals1, update_actor=True): """Perform gradient update step given a batch of data. Parameters ---------- obs0 : array_like batch of observations actions : array_like batch of actions executed given obs_batch rewards : array_like rewards received as results of executing act_batch obs1 : array_like next set of observations seen after executing act_batch terminals1 : numpy bool done_mask[i] = 1 if executing act_batch[i] resulted in the end of an episode and 0 otherwise. update_actor : bool whether to update the actor policy. Unused by this method. """ del update_actor # unused by this method # Normalize the actions (bounded between [-1, 1]). actions = (actions - self._ac_means) / self._ac_magnitudes # Reshape to match previous behavior and placeholder shape. rewards = rewards.reshape(-1, 1) terminals1 = terminals1.reshape(-1, 1) # Collect all update and loss call operations. step_ops = [ self.critic_optimizer, self.actor_optimizer, self.alpha_optimizer, self.target_soft_updates, ] # Prepare the feed_dict information. feed_dict = { self.obs_ph: obs0, self.action_ph: actions, self.rew_ph: rewards, self.obs1_ph: obs1, self.terminals1: terminals1 } # Perform the update operations. self.sess.run(step_ops, feed_dict) def get_action(self, obs, context, apply_noise, random_actions, env_num=0): """See parent class.""" # Add the contextual observation, if applicable. obs = self._get_obs(obs, context, axis=1) if random_actions: return np.array([self.ac_space.sample()]) elif apply_noise: normalized_action = self.sess.run(self.policy_out, feed_dict={self.obs_ph: obs}) return self._ac_magnitudes * normalized_action + self._ac_means else: normalized_action = self.sess.run(self.deterministic_action, feed_dict={self.obs_ph: obs}) return self._ac_magnitudes * normalized_action + self._ac_means def _setup_critic_optimizer(self, scope): """Create minimization operation for critic Q-function. Create a `tf.optimizer.minimize` operation for updating critic Q-function with gradient descent. See Equations (5, 6) in [1], for further information of the Q-function update rule. """ scope_name = 'model/value_fns' if scope is not None: scope_name = scope + '/' + scope_name if self.verbose >= 2: print('setting up critic optimizer') for name in ['qf1', 'qf2', 'vf']: scope_i = '{}/{}'.format(scope_name, name) print_params_shape(scope_i, name) # Take the min of the two Q-Values (Double-Q Learning) min_qf_pi = tf.minimum(self.qf1_pi, self.qf2_pi) # Target for Q value regression q_backup = tf.stop_gradient(self.rew_ph + (1 - self.terminals1) * self.gamma * self.value_target) # choose the loss function if self.use_huber: loss_fn = tf.compat.v1.losses.huber_loss else: loss_fn = tf.compat.v1.losses.mean_squared_error # Compute Q-Function loss qf1_loss = loss_fn(q_backup, self.qf1) qf2_loss = loss_fn(q_backup, self.qf2) # Target for value fn regression # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. v_backup = tf.stop_gradient(min_qf_pi - self.alpha * self.logp_pi) value_loss = loss_fn(self.value_fn, v_backup) self.critic_loss = (qf1_loss, qf2_loss, value_loss) # Combine the loss functions for the optimizer. 
        critic_loss = qf1_loss + qf2_loss + value_loss

        # Critic train op
        critic_optimizer = tf.compat.v1.train.AdamOptimizer(self.critic_lr)
        self.critic_optimizer = critic_optimizer.minimize(
            critic_loss, var_list=get_trainable_vars(scope_name))

    def _setup_actor_optimizer(self, scope):
        """Create minimization operations for the policy and entropy.

        Creates `tf.optimizer.minimize` operations for updating the policy and
        entropy with gradient descent. See Section 4.2 in [1] for further
        information on the policy update, and Section 5 in [1] for further
        information on the entropy update.
        """
        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up actor and alpha optimizers')
            print_params_shape(scope_name, "actor")

        # Take the min of the two Q-Values (Double-Q Learning)
        min_qf_pi = tf.minimum(self.qf1_pi, self.qf2_pi)

        # Compute the entropy temperature loss.
        self.alpha_loss = -tf.reduce_mean(
            self.log_alpha
            * tf.stop_gradient(self.logp_pi + self.target_entropy))

        alpha_optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)
        self.alpha_optimizer = alpha_optimizer.minimize(
            self.alpha_loss, var_list=self.log_alpha)

        # Compute the policy loss
        self.actor_loss = tf.reduce_mean(
            self.alpha * self.logp_pi - min_qf_pi)

        # Add a regularization penalty.
        self.actor_loss += self._l2_loss(self.l2_penalty, scope_name)

        # Policy train op (has to be separate from value train op, because
        # min_qf_pi appears in policy_loss)
        actor_optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)
        self.actor_optimizer = actor_optimizer.minimize(
            self.actor_loss, var_list=get_trainable_vars(scope_name))

    def _setup_stats(self, base):
        """Create the running means and std of the model inputs and outputs.

        This method also adds the same running means and stds as scalars to
        tensorboard for additional storage.
        """
        ops = []
        names = []

        ops += [tf.reduce_mean(self.qf1)]
        names += ['{}/reference_Q1_mean'.format(base)]
        ops += [reduce_std(self.qf1)]
        names += ['{}/reference_Q1_std'.format(base)]

        ops += [tf.reduce_mean(self.qf2)]
        names += ['{}/reference_Q2_mean'.format(base)]
        ops += [reduce_std(self.qf2)]
        names += ['{}/reference_Q2_std'.format(base)]

        ops += [tf.reduce_mean(self.qf1_pi)]
        names += ['{}/reference_actor_Q1_mean'.format(base)]
        ops += [reduce_std(self.qf1_pi)]
        names += ['{}/reference_actor_Q1_std'.format(base)]

        ops += [tf.reduce_mean(self.qf2_pi)]
        names += ['{}/reference_actor_Q2_mean'.format(base)]
        ops += [reduce_std(self.qf2_pi)]
        names += ['{}/reference_actor_Q2_std'.format(base)]

        ops += [tf.reduce_mean(
            self._ac_magnitudes * self.policy_out + self._ac_means)]
        names += ['{}/reference_action_mean'.format(base)]
        ops += [reduce_std(
            self._ac_magnitudes * self.policy_out + self._ac_means)]
        names += ['{}/reference_action_std'.format(base)]

        ops += [tf.reduce_mean(self.logp_pi)]
        names += ['{}/reference_log_probability_mean'.format(base)]
        ops += [reduce_std(self.logp_pi)]
        names += ['{}/reference_log_probability_std'.format(base)]

        ops += [tf.reduce_mean(self.rew_ph)]
        names += ['{}/rewards'.format(base)]

        ops += [self.alpha_loss]
        names += ['{}/alpha_loss'.format(base)]
        ops += [self.actor_loss]
        names += ['{}/actor_loss'.format(base)]
        ops += [self.critic_loss[0]]
        names += ['{}/Q1_loss'.format(base)]
        ops += [self.critic_loss[1]]
        names += ['{}/Q2_loss'.format(base)]
        ops += [self.critic_loss[2]]
        names += ['{}/value_loss'.format(base)]

        # Add all names and ops to the tensorboard summary.
for op, name in zip(ops, names): tf.compat.v1.summary.scalar(name, op) return ops, names def initialize(self): """See parent class.""" self.sess.run(self.target_init_updates) def store_transition(self, obs0, context0, action, reward, obs1, context1, done, is_final_step, env_num=0, evaluate=False): """See parent class.""" if not evaluate: # Add the contextual observation, if applicable. obs0 = self._get_obs(obs0, context0, axis=0) obs1 = self._get_obs(obs1, context1, axis=0) self.replay_buffer.add(obs0, action, reward, obs1, float(done)) def get_td_map(self): """See parent class.""" # Not enough samples in the replay buffer. if not self.replay_buffer.can_sample(): return {} # Get a batch. obs0, actions, rewards, obs1, done1 = self.replay_buffer.sample() return self.get_td_map_from_batch(obs0, actions, rewards, obs1, done1) def get_td_map_from_batch(self, obs0, actions, rewards, obs1, terminals1): """Convert a batch to a td_map.""" # Reshape to match previous behavior and placeholder shape. rewards = rewards.reshape(-1, 1) terminals1 = terminals1.reshape(-1, 1) td_map = { self.obs_ph: obs0, self.action_ph: actions, self.rew_ph: rewards, self.obs1_ph: obs1, self.terminals1: terminals1 } return td_map
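# NOTE: The SAC actor above relies on the helpers `gaussian_likelihood` and
# `apply_squashing_func`. The NumPy sketch below illustrates the combined
# effect they are expected to have (reparameterized sampling, tanh squashing,
# and the change-of-variables correction to the log-probability). It assumes
# `numpy as np` is imported and is provided for illustration only.
def squashed_gaussian_sample(mean, log_std, rng=np.random):
    """Sample a tanh-squashed Gaussian action and its corrected log-prob."""
    std = np.exp(log_std)

    # Reparameterization trick: sample = mean + std * eps, with eps ~ N(0, 1).
    pre_tanh = mean + std * rng.normal(size=mean.shape)

    # Log-likelihood of the pre-squash sample under the diagonal Gaussian.
    logp = -0.5 * np.sum(
        ((pre_tanh - mean) / std) ** 2 + 2.0 * log_std + np.log(2.0 * np.pi),
        axis=-1)

    # Squash into (-1, 1) and correct for the tanh change of variables.
    action = np.tanh(pre_tanh)
    logp -= np.sum(np.log(1.0 - action ** 2 + 1e-6), axis=-1)

    return action, logp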
def __init__(self, sess, ob_space, ac_space, co_space, buffer_size, batch_size, actor_lr, critic_lr, verbose, tau, gamma, use_huber, l2_penalty, model_params, noise, target_policy_noise, target_noise_clip, scope=None, num_envs=1): """Instantiate the feed-forward neural network policy. Parameters ---------- sess : tf.compat.v1.Session the current TensorFlow session ob_space : gym.spaces.* the observation space of the environment ac_space : gym.spaces.* the action space of the environment co_space : gym.spaces.* the context space of the environment buffer_size : int the max number of transitions to store batch_size : int SGD batch size actor_lr : float actor learning rate critic_lr : float critic learning rate verbose : int the verbosity level: 0 none, 1 training information, 2 tensorflow debug tau : float target update rate gamma : float discount factor use_huber : bool specifies whether to use the huber distance function as the loss for the critic. If set to False, the mean-squared error metric is used instead l2_penalty : float L2 regularization penalty. This is applied to the policy network. model_params : dict dictionary of model-specific parameters. See parent class. noise : float scaling term to the range of the action space, that is subsequently used as the standard deviation of Gaussian noise added to the action if `apply_noise` is set to True in `get_action` target_policy_noise : float standard deviation term to the noise from the output of the target actor policy. See TD3 paper for more. target_noise_clip : float clipping term for the noise injected in the target actor policy scope : str an upper-level scope term. Used by policies that call this one. """ super(FeedForwardPolicy, self).__init__( sess=sess, ob_space=ob_space, ac_space=ac_space, co_space=co_space, buffer_size=buffer_size, batch_size=batch_size, actor_lr=actor_lr, critic_lr=critic_lr, verbose=verbose, tau=tau, gamma=gamma, use_huber=use_huber, l2_penalty=l2_penalty, model_params=model_params, num_envs=num_envs, ) # action magnitudes ac_mag = 0.5 * (ac_space.high - ac_space.low) self.noise = noise * ac_mag self.target_policy_noise = np.array([ac_mag * target_policy_noise]) self.target_noise_clip = np.array([ac_mag * target_noise_clip]) # Compute the shape of the input observation space, which may include # the contextual term. ob_dim = self._get_ob_dim(ob_space, co_space) # =================================================================== # # Step 1: Create a replay buffer object. # # =================================================================== # self.replay_buffer = ReplayBuffer( buffer_size=self.buffer_size, batch_size=self.batch_size, obs_dim=ob_dim[0], ac_dim=self.ac_space.shape[0], ) # =================================================================== # # Step 2: Create input variables. # # =================================================================== # with tf.compat.v1.variable_scope("input", reuse=False): self.terminals1 = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rew_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='rewards') self.action_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ac_space.shape, name='actions') self.obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ob_dim, name='obs0') self.obs1_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ob_dim, name='obs1') # =================================================================== # # Step 3: Create actor and critic variables. 
# # =================================================================== # # Create networks and core TF parts that are shared across setup parts. with tf.compat.v1.variable_scope("model", reuse=False): self.actor_tf = self.make_actor(self.obs_ph) self.critic_tf = [ self.make_critic(self.obs_ph, self.action_ph, scope="qf_{}".format(i)) for i in range(2) ] self.critic_with_actor_tf = [ self.make_critic(self.obs_ph, self.actor_tf, reuse=True, scope="qf_{}".format(i)) for i in range(2) ] with tf.compat.v1.variable_scope("target", reuse=False): # create the target actor policy actor_target = self.make_actor(self.obs1_ph) # smooth target policy by adding clipped noise to target actions target_noise = tf.random.normal(tf.shape(actor_target), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # clip the noisy action to remain in the bounds noisy_actor_target = tf.clip_by_value(actor_target + target_noise, self.ac_space.low, self.ac_space.high) # create the target critic policies critic_target = [ self.make_critic(self.obs1_ph, noisy_actor_target, scope="qf_{}".format(i)) for i in range(2) ] # Create the target update operations. init, soft = self._setup_target_updates('model', 'target', scope, tau, verbose) self.target_init_updates = init self.target_soft_updates = soft # =================================================================== # # Step 4: Setup the optimizers for the actor and critic. # # =================================================================== # with tf.compat.v1.variable_scope("Optimizer", reuse=False): self._setup_actor_optimizer(scope) self._setup_critic_optimizer(critic_target, scope) # =================================================================== # # Step 5: Setup the operations for computing model statistics. # # =================================================================== # # Setup the running means and standard deviations of the model inputs # and outputs. self.stats_ops, self.stats_names = self._setup_stats(scope or "Model")
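# Hedged sketch (illustration only): the target-policy smoothing performed in
# the "target" scope above, reproduced with numpy on a toy action so the
# order of the two clipping steps is easy to follow. The bounds and noise
# scales below are illustrative assumptions.
import numpy as np

ac_low, ac_high = np.array([-1.0]), np.array([1.0])
ac_mag = 0.5 * (ac_high - ac_low)

target_policy_noise = 0.2 * ac_mag   # std of the smoothing noise
target_noise_clip = 0.5 * ac_mag     # hard limit on the noise magnitude

actor_target = np.array([0.9])       # toy output of the target actor

# 1. sample Gaussian noise, 2. clip the noise, 3. clip the noisy action.
noise = np.random.normal(0.0, target_policy_noise, actor_target.shape)
noise = np.clip(noise, -target_noise_clip, target_noise_clip)
noisy_actor_target = np.clip(actor_target + noise, ac_low, ac_high)

print(noisy_actor_target)  # always stays within [ac_low, ac_high]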
class FeedForwardPolicy(ActorCriticPolicy): """Feed-forward neural network actor-critic policy. Attributes ---------- sess : tf.compat.v1.Session the current TensorFlow session ob_space : gym.spaces.* the observation space of the environment ac_space : gym.spaces.* the action space of the environment co_space : gym.spaces.* the context space of the environment buffer_size : int the max number of transitions to store batch_size : int SGD batch size actor_lr : float actor learning rate critic_lr : float critic learning rate verbose : int the verbosity level: 0 none, 1 training information, 2 tensorflow debug tau : float target update rate gamma : float discount factor use_huber : bool specifies whether to use the huber distance function as the loss for the critic. If set to False, the mean-squared error metric is used instead l2_penalty : float L2 regularization penalty. This is applied to the policy network. model_params : dict dictionary of model-specific parameters. See parent class. noise : float scaling term to the range of the action space, that is subsequently used as the standard deviation of Gaussian noise added to the action if `apply_noise` is set to True in `get_action` target_policy_noise : float standard deviation term to the noise from the output of the target actor policy. See TD3 paper for more. target_noise_clip : float clipping term for the noise injected in the target actor policy replay_buffer : hbaselines.fcnet.replay_buffer.ReplayBuffer the replay buffer terminals1 : tf.compat.v1.placeholder placeholder for the next step terminals rew_ph : tf.compat.v1.placeholder placeholder for the rewards action_ph : tf.compat.v1.placeholder placeholder for the actions obs_ph : tf.compat.v1.placeholder placeholder for the observations obs1_ph : tf.compat.v1.placeholder placeholder for the next step observations actor_tf : tf.Variable the output from the actor network critic_tf : list of tf.Variable the output from the critic networks. Two networks are used to stabilize training. critic_with_actor_tf : list of tf.Variable the output from the critic networks with the action provided directly by the actor policy target_init_updates : tf.Operation an operation that sets the values of the trainable parameters of the target actor/critic to match those actual actor/critic target_soft_updates : tf.Operation soft target update function actor_loss : tf.Operation the operation that returns the loss of the actor actor_optimizer : tf.Operation the operation that updates the trainable parameters of the actor critic_loss : tf.Operation the operation that returns the loss of the critic critic_optimizer : tf.Operation the operation that updates the trainable parameters of the critic """ def __init__(self, sess, ob_space, ac_space, co_space, buffer_size, batch_size, actor_lr, critic_lr, verbose, tau, gamma, use_huber, l2_penalty, model_params, noise, target_policy_noise, target_noise_clip, scope=None, num_envs=1): """Instantiate the feed-forward neural network policy. 
Parameters ---------- sess : tf.compat.v1.Session the current TensorFlow session ob_space : gym.spaces.* the observation space of the environment ac_space : gym.spaces.* the action space of the environment co_space : gym.spaces.* the context space of the environment buffer_size : int the max number of transitions to store batch_size : int SGD batch size actor_lr : float actor learning rate critic_lr : float critic learning rate verbose : int the verbosity level: 0 none, 1 training information, 2 tensorflow debug tau : float target update rate gamma : float discount factor use_huber : bool specifies whether to use the huber distance function as the loss for the critic. If set to False, the mean-squared error metric is used instead l2_penalty : float L2 regularization penalty. This is applied to the policy network. model_params : dict dictionary of model-specific parameters. See parent class. noise : float scaling term to the range of the action space, that is subsequently used as the standard deviation of Gaussian noise added to the action if `apply_noise` is set to True in `get_action` target_policy_noise : float standard deviation term to the noise from the output of the target actor policy. See TD3 paper for more. target_noise_clip : float clipping term for the noise injected in the target actor policy scope : str an upper-level scope term. Used by policies that call this one. """ super(FeedForwardPolicy, self).__init__( sess=sess, ob_space=ob_space, ac_space=ac_space, co_space=co_space, buffer_size=buffer_size, batch_size=batch_size, actor_lr=actor_lr, critic_lr=critic_lr, verbose=verbose, tau=tau, gamma=gamma, use_huber=use_huber, l2_penalty=l2_penalty, model_params=model_params, num_envs=num_envs, ) # action magnitudes ac_mag = 0.5 * (ac_space.high - ac_space.low) self.noise = noise * ac_mag self.target_policy_noise = np.array([ac_mag * target_policy_noise]) self.target_noise_clip = np.array([ac_mag * target_noise_clip]) # Compute the shape of the input observation space, which may include # the contextual term. ob_dim = self._get_ob_dim(ob_space, co_space) # =================================================================== # # Step 1: Create a replay buffer object. # # =================================================================== # self.replay_buffer = ReplayBuffer( buffer_size=self.buffer_size, batch_size=self.batch_size, obs_dim=ob_dim[0], ac_dim=self.ac_space.shape[0], ) # =================================================================== # # Step 2: Create input variables. # # =================================================================== # with tf.compat.v1.variable_scope("input", reuse=False): self.terminals1 = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rew_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='rewards') self.action_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ac_space.shape, name='actions') self.obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ob_dim, name='obs0') self.obs1_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ob_dim, name='obs1') # =================================================================== # # Step 3: Create actor and critic variables. # # =================================================================== # # Create networks and core TF parts that are shared across setup parts. 
with tf.compat.v1.variable_scope("model", reuse=False): self.actor_tf = self.make_actor(self.obs_ph) self.critic_tf = [ self.make_critic(self.obs_ph, self.action_ph, scope="qf_{}".format(i)) for i in range(2) ] self.critic_with_actor_tf = [ self.make_critic(self.obs_ph, self.actor_tf, reuse=True, scope="qf_{}".format(i)) for i in range(2) ] with tf.compat.v1.variable_scope("target", reuse=False): # create the target actor policy actor_target = self.make_actor(self.obs1_ph) # smooth target policy by adding clipped noise to target actions target_noise = tf.random.normal(tf.shape(actor_target), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # clip the noisy action to remain in the bounds noisy_actor_target = tf.clip_by_value(actor_target + target_noise, self.ac_space.low, self.ac_space.high) # create the target critic policies critic_target = [ self.make_critic(self.obs1_ph, noisy_actor_target, scope="qf_{}".format(i)) for i in range(2) ] # Create the target update operations. init, soft = self._setup_target_updates('model', 'target', scope, tau, verbose) self.target_init_updates = init self.target_soft_updates = soft # =================================================================== # # Step 4: Setup the optimizers for the actor and critic. # # =================================================================== # with tf.compat.v1.variable_scope("Optimizer", reuse=False): self._setup_actor_optimizer(scope) self._setup_critic_optimizer(critic_target, scope) # =================================================================== # # Step 5: Setup the operations for computing model statistics. # # =================================================================== # # Setup the running means and standard deviations of the model inputs # and outputs. self.stats_ops, self.stats_names = self._setup_stats(scope or "Model") def _setup_actor_optimizer(self, scope): """Create the actor loss, gradient, and optimizer.""" scope_name = 'model/pi/' if scope is not None: scope_name = scope + '/' + scope_name if self.verbose >= 2: print('setting up actor optimizer') print_params_shape(scope_name, "actor") # Compute the actor loss. self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf[0]) # Add a regularization penalty. self.actor_loss += self._l2_loss(self.l2_penalty, scope_name) # Create an optimizer object. optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr) self.actor_optimizer = optimizer.minimize( self.actor_loss, var_list=get_trainable_vars(scope_name)) def _setup_critic_optimizer(self, critic_target, scope): """Create the critic loss, gradient, and optimizer.""" if self.verbose >= 2: print('setting up critic optimizer') # compute the target critic term with tf.compat.v1.variable_scope("loss", reuse=False): q_obs1 = tf.minimum(critic_target[0], critic_target[1]) target_q = tf.stop_gradient(self.rew_ph + (1. 
- self.terminals1) * self.gamma * q_obs1) tf.compat.v1.summary.scalar('critic_target', tf.reduce_mean(target_q)) # choose the loss function if self.use_huber: loss_fn = tf.compat.v1.losses.huber_loss else: loss_fn = tf.compat.v1.losses.mean_squared_error self.critic_loss = [loss_fn(q, target_q) for q in self.critic_tf] self.critic_optimizer = [] for i, critic_loss in enumerate(self.critic_loss): scope_name = 'model/qf_{}/'.format(i) if scope is not None: scope_name = scope + '/' + scope_name if self.verbose >= 2: print_params_shape(scope_name, "critic {}".format(i)) # create an optimizer object optimizer = tf.compat.v1.train.AdamOptimizer(self.critic_lr) # create the optimizer object self.critic_optimizer.append( optimizer.minimize(loss=critic_loss, var_list=get_trainable_vars(scope_name))) def make_actor(self, obs, reuse=False, scope="pi"): """Create an actor tensor. Parameters ---------- obs : tf.compat.v1.placeholder the input observation placeholder reuse : bool whether or not to reuse parameters scope : str the scope name of the actor Returns ------- tf.Variable the output from the actor """ # Initial image pre-processing (for convolutional policies). if self.model_params["model_type"] == "conv": pi_h = create_conv( obs=obs, image_height=self.model_params["image_height"], image_width=self.model_params["image_width"], image_channels=self.model_params["image_channels"], ignore_flat_channels=self.model_params["ignore_flat_channels"], ignore_image=self.model_params["ignore_image"], filters=self.model_params["filters"], kernel_sizes=self.model_params["kernel_sizes"], strides=self.model_params["strides"], act_fun=self.model_params["act_fun"], layer_norm=self.model_params["layer_norm"], scope=scope, reuse=reuse, ) else: pi_h = obs # Create the model. policy = create_fcnet( obs=pi_h, layers=self.model_params["layers"], num_output=self.ac_space.shape[0], stochastic=False, act_fun=self.model_params["act_fun"], layer_norm=self.model_params["layer_norm"], scope=scope, reuse=reuse, ) # Scaling terms to the output from the policy. ac_means = (self.ac_space.high + self.ac_space.low) / 2. ac_magnitudes = (self.ac_space.high - self.ac_space.low) / 2. # Apply squashing and scale by action space. return ac_means + ac_magnitudes * tf.nn.tanh(policy) def make_critic(self, obs, action, reuse=False, scope="qf"): """Create a critic tensor. Parameters ---------- obs : tf.compat.v1.placeholder the input observation placeholder action : tf.compat.v1.placeholder the input action placeholder reuse : bool whether or not to reuse parameters scope : str the scope name of the actor Returns ------- tf.Variable the output from the critic """ # Concatenate the observations and actions. qf_h = tf.concat([obs, action], axis=-1) # Initial image pre-processing (for convolutional policies). 
if self.model_params["model_type"] == "conv": qf_h = create_conv( obs=qf_h, image_height=self.model_params["image_height"], image_width=self.model_params["image_width"], image_channels=self.model_params["image_channels"], ignore_flat_channels=self.model_params["ignore_flat_channels"], ignore_image=self.model_params["ignore_image"], filters=self.model_params["filters"], kernel_sizes=self.model_params["kernel_sizes"], strides=self.model_params["strides"], act_fun=self.model_params["act_fun"], layer_norm=self.model_params["layer_norm"], scope=scope, reuse=reuse, ) return create_fcnet( obs=qf_h, layers=self.model_params["layers"], num_output=1, stochastic=False, act_fun=self.model_params["act_fun"], layer_norm=self.model_params["layer_norm"], scope=scope, reuse=reuse, output_pre="qf_", ) def update(self, update_actor=True, **kwargs): """Perform a gradient update step. **Note**; The target update soft updates occur at the same frequency as the actor update frequencies. Parameters ---------- update_actor : bool specifies whether to update the actor policy. The critic policy is still updated if this value is set to False. """ # Not enough samples in the replay buffer. if not self.replay_buffer.can_sample(): return # Get a batch obs0, actions, rewards, obs1, terminals1 = self.replay_buffer.sample() return self.update_from_batch(obs0, actions, rewards, obs1, terminals1, update_actor) def update_from_batch(self, obs0, actions, rewards, obs1, terminals1, update_actor=True): """Perform gradient update step given a batch of data. Parameters ---------- obs0 : array_like batch of observations actions : array_like batch of actions executed given obs_batch rewards : array_like rewards received as results of executing act_batch obs1 : array_like next set of observations seen after executing act_batch terminals1 : numpy bool done_mask[i] = 1 if executing act_batch[i] resulted in the end of an episode and 0 otherwise. update_actor : bool, optional specified whether to perform gradient update procedures to the actor policy. Default set to True. Note that the update procedure for the critic is always performed when calling this method. """ # Reshape to match previous behavior and placeholder shape. rewards = rewards.reshape(-1, 1) terminals1 = terminals1.reshape(-1, 1) # Update operations for the critic networks. step_ops = [self.critic_optimizer[0], self.critic_optimizer[1]] if update_actor: # Actor updates and target soft update operation. step_ops += [self.actor_optimizer, self.target_soft_updates] # Perform the update operations. self.sess.run(step_ops, feed_dict={ self.obs_ph: obs0, self.action_ph: actions, self.rew_ph: rewards, self.obs1_ph: obs1, self.terminals1: terminals1 }) def get_action(self, obs, context, apply_noise, random_actions, env_num=0): """See parent class.""" # Add the contextual observation, if applicable. obs = self._get_obs(obs, context, axis=1) if random_actions: action = np.array([self.ac_space.sample()]) else: action = self.sess.run(self.actor_tf, {self.obs_ph: obs}) if apply_noise: # compute noisy action if apply_noise: action += np.random.normal(0, self.noise, action.shape) # clip by bounds action = np.clip(action, self.ac_space.low, self.ac_space.high) return action def store_transition(self, obs0, context0, action, reward, obs1, context1, done, is_final_step, env_num=0, evaluate=False): """See parent class.""" if not evaluate: # Add the contextual observation, if applicable. 
obs0 = self._get_obs(obs0, context0, axis=0) obs1 = self._get_obs(obs1, context1, axis=0) # Modify the done mask in accordance with the TD3 algorithm. Done # masks that correspond to the final step are set to False. done = done and not is_final_step self.replay_buffer.add(obs0, action, reward, obs1, float(done)) def initialize(self): """See parent class. This method initializes the target parameters to match the model parameters. """ self.sess.run(self.target_init_updates) def _setup_stats(self, base): """Create the running means and std of the model inputs and outputs. This method also adds the same running means and stds as scalars to tensorboard for additional storage. """ ops = [] names = [] ops += [tf.reduce_mean(self.critic_tf[0])] names += ['{}/reference_Q1_mean'.format(base)] ops += [reduce_std(self.critic_tf[0])] names += ['{}/reference_Q1_std'.format(base)] ops += [tf.reduce_mean(self.critic_tf[1])] names += ['{}/reference_Q2_mean'.format(base)] ops += [reduce_std(self.critic_tf[1])] names += ['{}/reference_Q2_std'.format(base)] ops += [tf.reduce_mean(self.critic_with_actor_tf[0])] names += ['{}/reference_actor_Q1_mean'.format(base)] ops += [reduce_std(self.critic_with_actor_tf[0])] names += ['{}/reference_actor_Q1_std'.format(base)] ops += [tf.reduce_mean(self.critic_with_actor_tf[1])] names += ['{}/reference_actor_Q2_mean'.format(base)] ops += [reduce_std(self.critic_with_actor_tf[1])] names += ['{}/reference_actor_Q2_std'.format(base)] ops += [tf.reduce_mean(self.actor_tf)] names += ['{}/reference_action_mean'.format(base)] ops += [reduce_std(self.actor_tf)] names += ['{}/reference_action_std'.format(base)] ops += [tf.reduce_mean(self.rew_ph)] names += ['{}/rewards'.format(base)] ops += [self.actor_loss] names += ['{}/actor_loss'.format(base)] ops += [self.critic_loss[0]] names += ['{}/Q1_loss'.format(base)] ops += [self.critic_loss[1]] names += ['{}/Q2_loss'.format(base)] # Add all names and ops to the tensorboard summary. for op, name in zip(ops, names): tf.compat.v1.summary.scalar(name, op) return ops, names def get_td_map(self): """See parent class.""" # Not enough samples in the replay buffer. if not self.replay_buffer.can_sample(): return {} # Get a batch. obs0, actions, rewards, obs1, done1 = self.replay_buffer.sample() return self.get_td_map_from_batch(obs0, actions, rewards, obs1, done1) def get_td_map_from_batch(self, obs0, actions, rewards, obs1, terminals1): """Convert a batch to a td_map.""" # Reshape to match previous behavior and placeholder shape. rewards = rewards.reshape(-1, 1) terminals1 = terminals1.reshape(-1, 1) td_map = { self.obs_ph: obs0, self.action_ph: actions, self.rew_ph: rewards, self.obs1_ph: obs1, self.terminals1: terminals1 } return td_map
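# Hedged sketch (illustration only): the critic target computed in
# _setup_critic_optimizer above, written out with numpy for a single batch.
# The reward, done mask, discount, and Q-values below are made-up numbers.
import numpy as np

gamma = 0.99
rewards = np.array([[1.0], [0.5]])
terminals1 = np.array([[0.0], [1.0]])   # 1 => obs1 ended the episode
q1_target = np.array([[10.0], [8.0]])   # target critic 1 at (obs1, noisy target action)
q2_target = np.array([[9.0], [7.5]])    # target critic 2 at the same input

# Clipped double-Q learning: take the element-wise minimum of the two target
# critics, then form the usual bootstrapped target (no bootstrap on terminals).
q_obs1 = np.minimum(q1_target, q2_target)
target_q = rewards + (1.0 - terminals1) * gamma * q_obs1

print(target_q)  # [[9.91], [0.5]]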
def __init__(self, sess, ob_space, ac_space, co_space, buffer_size, batch_size, learning_rate, verbose, layer_norm, layers, act_fun, use_huber, stochastic, scope=None): """Instantiate the policy object. Parameters ---------- sess : tf.compat.v1.Session the current TensorFlow session ob_space : gym.spaces.* the observation space of the environment ac_space : gym.spaces.* the action space of the environment co_space : gym.spaces.* the context space of the environment buffer_size : int the max number of transitions to store batch_size : int SGD batch size learning_rate : float the learning rate for the policy verbose : int the verbosity level: 0 none, 1 training information, 2 tensorflow debug layer_norm : bool enable layer normalisation layers : list of int or None the size of the Neural network for the policy act_fun : tf.nn.* the activation function to use in the neural network use_huber : bool specifies whether to use the huber distance function as the loss function. If set to False, the mean-squared error metric is used instead stochastic : bool specifies whether the policies are stochastic or deterministic scope : str an upper-level scope term. Used by policies that call this one. """ super(FeedForwardPolicy, self).__init__(sess=sess, ob_space=ob_space, ac_space=ac_space, co_space=co_space, buffer_size=buffer_size, batch_size=batch_size, learning_rate=learning_rate, verbose=verbose, layer_norm=layer_norm, layers=layers, act_fun=act_fun, use_huber=use_huber, stochastic=stochastic) assert len(self.layers) >= 1, \ "Error: must have at least one hidden layer for the policy." # Compute the shape of the input observation space, which may include # the contextual term. ob_dim = self._get_ob_dim(ob_space, co_space) # =================================================================== # # Step 1: Create a replay buffer object. # # =================================================================== # self.replay_buffer = ReplayBuffer( buffer_size=self.buffer_size, batch_size=self.batch_size, obs_dim=ob_dim[0], ac_dim=self.ac_space.shape[0], ) # =================================================================== # # Step 2: Create input variables. # # =================================================================== # with tf.compat.v1.variable_scope("input", reuse=False): self.action_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ac_space.shape, name='actions') self.obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ob_dim, name='obs0') # =================================================================== # # Step 3: Create policy variables. # # =================================================================== # self.policy = None self.logp_ac = None # Create networks and core TF parts that are shared across setup parts. with tf.compat.v1.variable_scope("model", reuse=False): if self.stochastic: self._setup_stochastic_policy(self.obs_ph, self.action_ph) else: self._setup_deterministic_policy(self.obs_ph) # =================================================================== # # Step 4: Setup the optimizer. # # =================================================================== # self.loss = None self.optimizer = None with tf.compat.v1.variable_scope("Optimizer", reuse=False): if self.stochastic: self._setup_stochastic_optimizer(scope) else: self._setup_deterministic_optimizer(self.action_ph, scope) # =================================================================== # # Step 5: Setup the operations for computing model statistics. 
# # =================================================================== # # Setup the running means and standard deviations of the model inputs # and outputs. self.stats_ops, self.stats_names = self._setup_stats(scope or "Model")
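# Hedged sketch (illustration only): the two behavioral-cloning losses the
# imitation policy above chooses between. A deterministic policy regresses
# the expert action directly (MSE or Huber), while a stochastic policy
# maximizes the log-likelihood of the expert action under a diagonal
# Gaussian. All numbers below are made up for the example.
import numpy as np

expert_actions = np.array([[0.2], [-0.4]])

# Deterministic case: squared-error regression on the expert actions.
predicted_actions = np.array([[0.1], [-0.5]])
mse_loss = np.mean((expert_actions - predicted_actions) ** 2)

# Stochastic case: Gaussian negative log-likelihood with the mean and
# log-std produced by the network (standard diagonal-Gaussian formula).
mean = np.array([[0.15], [-0.45]])
log_std = np.array([[-1.0], [-1.0]])
logp = -0.5 * (((expert_actions - mean) / np.exp(log_std)) ** 2
               + 2.0 * log_std + np.log(2.0 * np.pi))
nll_loss = -np.mean(np.sum(logp, axis=-1))

print(mse_loss, nll_loss)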
class FeedForwardPolicy(ImitationLearningPolicy): """Fully-connected neural network imitation learning policy. Attributes ---------- replay_buffer : hbaselines.fcnet.replay_buffer.ReplayBuffer the replay buffer action_ph : tf.compat.v1.placeholder placeholder for the actions obs_ph : tf.compat.v1.placeholder placeholder for the observations policy : tf.Variable the output from the imitation learning policy logp_ac : tf.Operation the operation that computes the log-probability of a given action. Only applies to stochastic policies. loss : tf.Operation the operation that computes the loss optimizer : tf.Operation the operation that updates the trainable parameters of the policy """ def __init__(self, sess, ob_space, ac_space, co_space, buffer_size, batch_size, learning_rate, verbose, layer_norm, layers, act_fun, use_huber, stochastic, scope=None): """Instantiate the policy object. Parameters ---------- sess : tf.compat.v1.Session the current TensorFlow session ob_space : gym.spaces.* the observation space of the environment ac_space : gym.spaces.* the action space of the environment co_space : gym.spaces.* the context space of the environment buffer_size : int the max number of transitions to store batch_size : int SGD batch size learning_rate : float the learning rate for the policy verbose : int the verbosity level: 0 none, 1 training information, 2 tensorflow debug layer_norm : bool enable layer normalisation layers : list of int or None the size of the Neural network for the policy act_fun : tf.nn.* the activation function to use in the neural network use_huber : bool specifies whether to use the huber distance function as the loss function. If set to False, the mean-squared error metric is used instead stochastic : bool specifies whether the policies are stochastic or deterministic scope : str an upper-level scope term. Used by policies that call this one. """ super(FeedForwardPolicy, self).__init__(sess=sess, ob_space=ob_space, ac_space=ac_space, co_space=co_space, buffer_size=buffer_size, batch_size=batch_size, learning_rate=learning_rate, verbose=verbose, layer_norm=layer_norm, layers=layers, act_fun=act_fun, use_huber=use_huber, stochastic=stochastic) assert len(self.layers) >= 1, \ "Error: must have at least one hidden layer for the policy." # Compute the shape of the input observation space, which may include # the contextual term. ob_dim = self._get_ob_dim(ob_space, co_space) # =================================================================== # # Step 1: Create a replay buffer object. # # =================================================================== # self.replay_buffer = ReplayBuffer( buffer_size=self.buffer_size, batch_size=self.batch_size, obs_dim=ob_dim[0], ac_dim=self.ac_space.shape[0], ) # =================================================================== # # Step 2: Create input variables. # # =================================================================== # with tf.compat.v1.variable_scope("input", reuse=False): self.action_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ac_space.shape, name='actions') self.obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + ob_dim, name='obs0') # =================================================================== # # Step 3: Create policy variables. # # =================================================================== # self.policy = None self.logp_ac = None # Create networks and core TF parts that are shared across setup parts. 
with tf.compat.v1.variable_scope("model", reuse=False): if self.stochastic: self._setup_stochastic_policy(self.obs_ph, self.action_ph) else: self._setup_deterministic_policy(self.obs_ph) # =================================================================== # # Step 4: Setup the optimizer. # # =================================================================== # self.loss = None self.optimizer = None with tf.compat.v1.variable_scope("Optimizer", reuse=False): if self.stochastic: self._setup_stochastic_optimizer(scope) else: self._setup_deterministic_optimizer(self.action_ph, scope) # =================================================================== # # Step 5: Setup the operations for computing model statistics. # # =================================================================== # # Setup the running means and standard deviations of the model inputs # and outputs. self.stats_ops, self.stats_names = self._setup_stats(scope or "Model") def _setup_stochastic_policy(self, obs, action, reuse=False, scope="pi"): """Create the variables of a stochastic policy. Parameters ---------- obs : tf.compat.v1.placeholder the input observation placeholder action : tf.compat.v1.placeholder the input action placeholder reuse : bool whether or not to reuse parameters scope : str the scope name of the policy """ with tf.compat.v1.variable_scope(scope, reuse=reuse): pi_h = obs # create the hidden layers for i, layer_size in enumerate(self.layers): pi_h = layer(pi_h, layer_size, 'fc{}'.format(i), act_fun=self.act_fun, layer_norm=self.layer_norm) # create the output mean policy_mean = layer( pi_h, self.ac_space.shape[0], 'mean', act_fun=None, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) # create the output log_std log_std = layer( pi_h, self.ac_space.shape[0], 'log_std', act_fun=None, ) # OpenAI Variation to cap the standard deviation log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX) std = tf.exp(log_std) # Reparameterization trick policy = policy_mean + tf.random.normal(tf.shape(policy_mean)) * std logp_pi = gaussian_likelihood(policy, policy_mean, log_std) logp_ac = gaussian_likelihood(action, policy_mean, log_std) # Apply squashing and account for it in the probability _, _, logp_ac = apply_squashing_func(policy_mean, action, logp_ac) _, policy, _ = apply_squashing_func(policy_mean, policy, logp_pi) # Store the variables under their respective parameters. self.policy = policy self.logp_ac = logp_ac def _setup_stochastic_optimizer(self, scope): """Create the loss and optimizer of a stochastic policy.""" scope_name = 'model/pi/' if scope is not None: scope_name = scope + '/' + scope_name if self.verbose >= 2: print('setting up optimizer') print_params_shape(scope_name, "policy") # Define the loss function. self.loss = -tf.reduce_mean(self.logp_ac) # Create an optimizer object. optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate) # Create the optimizer operation. self.optimizer = optimizer.minimize( loss=self.loss, var_list=get_trainable_vars(scope_name)) def _setup_deterministic_policy(self, obs, reuse=False, scope="pi"): """Create the variables of deterministic a policy. 
Parameters ---------- obs : tf.compat.v1.placeholder the input observation placeholder reuse : bool whether or not to reuse parameters scope : str the scope name of the policy """ with tf.compat.v1.variable_scope(scope, reuse=reuse): pi_h = obs # create the hidden layers for i, layer_size in enumerate(self.layers): pi_h = layer(pi_h, layer_size, 'fc{}'.format(i), act_fun=self.act_fun, layer_norm=self.layer_norm) # create the output layer policy = layer(pi_h, self.ac_space.shape[0], 'output', act_fun=tf.nn.tanh, kernel_initializer=tf.random_uniform_initializer( minval=-3e-3, maxval=3e-3)) # scaling terms to the output from the policy ac_means = (self.ac_space.high + self.ac_space.low) / 2. ac_magnitudes = (self.ac_space.high - self.ac_space.low) / 2. policy = ac_means + ac_magnitudes * tf.to_float(policy) # Store the variables under their respective parameters. self.policy = policy def _setup_deterministic_optimizer(self, action, scope=None): """Create the loss and optimizer of a deterministic policy.""" scope_name = 'model/pi/' if scope is not None: scope_name = scope + '/' + scope_name if self.verbose >= 2: print('setting up optimizer') print_params_shape(scope_name, "policy") # Choose the loss function. if self.use_huber: loss_fn = tf.compat.v1.losses.huber_loss else: loss_fn = tf.compat.v1.losses.mean_squared_error # Define the loss function. self.loss = loss_fn(action, self.policy) # Create an optimizer object. optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate) # Create the optimizer operation. self.optimizer = optimizer.minimize( loss=self.loss, var_list=get_trainable_vars(scope_name)) def _setup_stats(self, base): """Create the running means and std of the model inputs and outputs. This method also adds the same running means and stds as scalars to tensorboard for additional storage. """ ops = [] names = [] ops += [tf.reduce_mean(self.policy)] names += ['{}/reference_action_mean'.format(base)] ops += [reduce_std(self.policy)] names += ['{}/reference_action_std'.format(base)] ops += [tf.reduce_mean(self.loss)] names += ['{}/reference_loss_mean'.format(base)] ops += [reduce_std(self.loss)] names += ['{}/reference_loss_std'.format(base)] # Add all names and ops to the tensorboard summary. for op, name in zip(ops, names): tf.compat.v1.summary.scalar(name, op) return ops, names def update(self): """See parent class.""" # Not enough samples in the replay buffer. if not self.replay_buffer.can_sample(): return 0 # Get a batch. obs0, actions, _, _, _ = self.replay_buffer.sample() return self.update_from_batch(obs0, actions) def update_from_batch(self, obs0, actions): """Perform gradient update step given a batch of data. Parameters ---------- obs0 : array_like batch of observations actions : array_like batch of actions executed given obs_batch Returns ------- float policy loss """ loss, *_ = self.sess.run([self.loss, self.optimizer], feed_dict={ self.obs_ph: obs0, self.action_ph: actions, }) return loss def get_action(self, obs, context): """See parent class.""" # Add the contextual observation, if applicable. obs = self._get_obs(obs, context, axis=1) # Compute the action by the policy. action = self.sess.run(self.policy, {self.obs_ph: obs}) if self.stochastic: # Scale the action by the action space of the environment. 
ac_means = 0.5 * (self.ac_space.high + self.ac_space.low) ac_magnitudes = 0.5 * (self.ac_space.high - self.ac_space.low) action = ac_magnitudes * action + ac_means return action def store_transition(self, obs0, context0, action, obs1, context1): """See parent class.""" # Add the contextual observation, if applicable. obs0 = self._get_obs(obs0, context0, axis=0) obs1 = self._get_obs(obs1, context1, axis=0) self.replay_buffer.add(obs0, action, 0, obs1, float(False)) def get_td_map(self): """See parent class.""" # Not enough samples in the replay buffer. if not self.replay_buffer.can_sample(): return {} # Get a batch. obs0, actions, _, _, _ = self.replay_buffer.sample() return self.get_td_map_from_batch(obs0, actions) def get_td_map_from_batch(self, obs0, actions): """Convert a batch to a td_map.""" return { self.obs_ph: obs0, self.action_ph: actions, }
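# Hedged sketch (illustration only): the affine rescaling used by get_action
# above to map a squashed action in [-1, 1] back to the environment's action
# bounds. The bounds below are illustrative assumptions.
import numpy as np

ac_low = np.array([-2.0, 0.0])
ac_high = np.array([2.0, 1.0])

ac_means = 0.5 * (ac_high + ac_low)
ac_magnitudes = 0.5 * (ac_high - ac_low)

squashed_action = np.array([-1.0, 1.0])   # e.g. the output of a tanh policy
env_action = ac_magnitudes * squashed_action + ac_means

print(env_action)  # [-2.  1.] -- the corners of [-1, 1] map to the bounds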