def actor_loss(self, time_steps):
  """Computes the actor loss.

  Args:
    time_steps: A batch of timesteps.

  Returns:
    actor_loss: A scalar actor loss.
  """
  with tf.compat.v1.name_scope('actor_loss'):
    actions, _ = self._actor_network(time_steps.observation,
                                     time_steps.step_type)
    with tf.GradientTape(watch_accessed_variables=False) as tape:
      tape.watch(actions)
      avg_expected_q_values = self._get_state_values(
          time_steps, actions, aggregate='mean')
      actions = tf.nest.flatten(actions)

    dqdas = tape.gradient([avg_expected_q_values], actions)

    actor_losses = []
    for dqda, action in zip(dqdas, actions):
      # There is actually only one element in this loop.
      loss = common.element_wise_squared_loss(
          tf.stop_gradient(dqda + action), action)
      loss = tf.reduce_sum(input_tensor=loss, axis=1)
      loss = tf.reduce_mean(input_tensor=loss)
      actor_losses.append(loss)

    actor_loss = tf.add_n(actor_losses)

  with tf.compat.v1.name_scope('Losses/'):
    tf.compat.v2.summary.scalar(
        name='actor_loss', data=actor_loss, step=self.train_step_counter)

  return actor_loss
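# Why the surrogate loss above implements gradient ascent on Q: with
# l(a) = (stop_gradient(dqda + a) - a)**2 we get dl/da = -2 * dqda, so
# minimizing l moves the action (and, via the chain rule, the actor
# parameters) along +dQ/da. A minimal standalone sketch, where `dqda` is
# an arbitrary constant standing in for the critic gradient:
import tensorflow as tf

a = tf.Variable([1.0, -2.0])
dqda = tf.constant([0.3, -0.7])  # stand-in for dQ/da from a critic
with tf.GradientTape() as tape:
  loss = tf.reduce_sum(tf.square(tf.stop_gradient(dqda + a) - a))
grad = tape.gradient(loss, a)
tf.debugging.assert_near(grad, -2.0 * dqda)  # descent direction is +dQ/da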
def actor_loss(self, time_steps):
  """Computes the actor_loss for DDPG training.

  Args:
    time_steps: A batch of timesteps.

  # TODO(kbanoop): Add an action norm regularizer.
  Returns:
    actor_loss: A scalar actor loss.
  """
  with tf.name_scope('actor_loss'):
    actions, _ = self._actor_network(time_steps.observation,
                                     time_steps.step_type)
    q_values, _ = self._critic_network(time_steps.observation, actions,
                                       time_steps.step_type)
    actions = nest.flatten(actions)
    dqdas = tf.gradients([q_values], actions)

    actor_losses = []
    for dqda, action in zip(dqdas, actions):
      if self._dqda_clipping is not None:
        dqda = tf.clip_by_value(dqda, -1 * self._dqda_clipping,
                                self._dqda_clipping)
      loss = common_utils.element_wise_squared_loss(
          tf.stop_gradient(dqda + action), action)
      if nest_utils.is_batched_nested_tensors(
          time_steps, self.time_step_spec(), num_outer_dims=2):
        # Sum over the time dimension.
        loss = tf.reduce_sum(loss, axis=1)
      loss = tf.reduce_mean(loss)
      actor_losses.append(loss)

    actor_loss = tf.add_n(actor_losses)

  with tf.name_scope('Losses/'):
    tf.contrib.summary.scalar('actor_loss', actor_loss)

  return actor_loss
def actor_loss(self,
               time_steps: ts.TimeStep,
               weights: Optional[types.Tensor] = None,
               training: bool = False) -> types.Tensor:
  """Computes the actor_loss for DDPG training.

  Args:
    time_steps: A batch of timesteps.
    weights: Optional scalar or element-wise (per-batch-entry) importance
      weights.
    training: Whether this loss is being used for training.

  # TODO(b/124383618): Add an action norm regularizer.
  Returns:
    actor_loss: A scalar actor loss.
  """
  with tf.name_scope('actor_loss'):
    actions, _ = self._actor_network(time_steps.observation,
                                     time_steps.step_type,
                                     training=training)
    with tf.GradientTape(watch_accessed_variables=False) as tape:
      tape.watch(actions)
      q_values, _ = self._critic_network((time_steps.observation, actions),
                                         time_steps.step_type,
                                         training=False)
      actions = tf.nest.flatten(actions)

    dqdas = tape.gradient([q_values], actions)

    actor_losses = []
    for dqda, action in zip(dqdas, actions):
      if self._dqda_clipping is not None:
        dqda = tf.clip_by_value(dqda, -1 * self._dqda_clipping,
                                self._dqda_clipping)
      loss = common.element_wise_squared_loss(
          tf.stop_gradient(dqda + action), action)
      if nest_utils.is_batched_nested_tensors(
          time_steps, self.time_step_spec, num_outer_dims=2):
        # Sum over the time dimension.
        loss = tf.reduce_sum(loss, axis=1)
      if weights is not None:
        loss *= weights
      loss = tf.reduce_mean(loss)
      actor_losses.append(loss)

    actor_loss = tf.add_n(actor_losses)

  with tf.name_scope('Losses/'):
    tf.compat.v2.summary.scalar(
        name='actor_loss', data=actor_loss, step=self.train_step_counter)

  return actor_loss
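# How a loss like the one above is typically consumed (a minimal sketch:
# `_actor_optimizer` and the rest of the enclosing agent class are
# assumptions here, and a real agent's train step carries more plumbing):
def _train_actor(self, time_steps, weights=None):
  variables = self._actor_network.trainable_variables
  with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(variables)
    loss = self.actor_loss(time_steps, weights=weights, training=True)
  grads = tape.gradient(loss, variables)
  self._actor_optimizer.apply_gradients(zip(grads, variables))
  return loss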
def actor_loss(self, time_steps, weights=None):
  """Computes the actor_loss for TD3 training.

  Args:
    time_steps: A batch of timesteps.
    weights: Optional scalar or element-wise (per-batch-entry) importance
      weights.

  Returns:
    actor_loss: A scalar actor loss.
  """
  with tf.name_scope('actor_loss'):
    actions, _ = self._actor_network(time_steps.observation,
                                     time_steps.step_type)
    # TD3 updates the actor against the first critic only.
    critic_network_input = (time_steps.observation, actions)
    q_values, _ = self._critic_network_1(critic_network_input,
                                         time_steps.step_type)

    actions = nest.flatten(actions)
    dqdas = tf.gradients([q_values], actions)
    actor_losses = []
    for dqda, action in zip(dqdas, actions):
      if self._dqda_clipping is not None:
        # pylint: disable=invalid-unary-operand-type
        dqda = tf.clip_by_value(dqda, -self._dqda_clipping,
                                self._dqda_clipping)
      loss = common_utils.element_wise_squared_loss(
          tf.stop_gradient(dqda + action), action)
      if nest_utils.is_batched_nested_tensors(
          time_steps, self.time_step_spec(), num_outer_dims=2):
        # Sum over the time dimension.
        loss = tf.reduce_sum(loss, axis=1)
      if weights is not None:
        loss *= weights
      loss = tf.reduce_mean(loss)
      actor_losses.append(loss)

    # TODO(kbanoop): Add an action norm regularizer.
    return tf.add_n(actor_losses)
def actor_loss(self, time_steps, weights=None):
  """Computes the actor_loss for DDPG training.

  Args:
    time_steps: A batch of timesteps.
    weights: Optional scalar or element-wise (per-batch-entry) importance
      weights.

  # TODO(kbanoop): Add an action norm regularizer.
  Returns:
    actor_loss: A scalar actor loss.
  """
  with tf.name_scope('actor_loss'):
    actions, _ = self._actor_network(time_steps.observation,
                                     time_steps.step_type)
    critic_net_input = (time_steps.observation, actions)
    q_values, _ = self._critic_network(critic_net_input,
                                       time_steps.step_type)

    actions = tf.nest.flatten(actions)
    dqdas = tf.gradients(ys=[q_values], xs=actions)

    actor_losses = []
    for dqda, action in zip(dqdas, actions):
      if self._dqda_clipping is not None:
        dqda = tf.clip_by_value(dqda, -1 * self._dqda_clipping,
                                self._dqda_clipping)
      loss = common_utils.element_wise_squared_loss(
          tf.stop_gradient(dqda + action), action)
      if nest_utils.is_batched_nested_tensors(
          time_steps, self.time_step_spec, num_outer_dims=2):
        # Sum over the time dimension.
        loss = tf.reduce_sum(input_tensor=loss, axis=1)
      if weights is not None:
        loss *= weights
      loss = tf.reduce_mean(input_tensor=loss)
      actor_losses.append(loss)

    actor_loss = tf.add_n(actor_losses)

  with tf.name_scope('Losses/'):
    tf.compat.v2.summary.scalar(
        name='actor_loss', data=actor_loss, step=self.train_step_counter)

  return actor_loss
def actor_loss(self,
               time_steps,
               total_actions,
               index,
               weights=None,
               training=False):
  """Computes the actor_loss for DDPG training.

  Args:
    time_steps: A batch of timesteps.
    total_actions: The current joint actions of all agents.
    index: Index of the agent whose actor is being updated.
    weights: Optional scalar or element-wise (per-batch-entry) importance
      weights.
    training: Whether this loss is being used for training.

  # TODO(b/124383618): Add an action norm regularizer.
  Returns:
    actor_loss: A scalar actor loss.
  """
  with tf.name_scope('actor_loss'):
    action_current_agent, _ = self._actor_network(
        time_steps.observation[index],
        time_steps.step_type,
        training=training)
    # Substitute this agent's action into the joint action tuple, keeping
    # the other agents' actions fixed.
    total_actions = list(total_actions)
    main_actions = []
    for i, action in enumerate(total_actions):
      if i == index:
        main_actions.append(action_current_agent)
      else:
        main_actions.append(action)
    main_actions = tuple(main_actions)

    with tf.GradientTape(watch_accessed_variables=False) as tape:
      tape.watch(main_actions)
      q_values, _ = self._critic_network(
          (time_steps.observation, main_actions),
          time_steps.step_type,
          training=training)
      main_actions = tf.nest.flatten(main_actions[index])

    dqdas = tape.gradient([q_values], main_actions)

    actor_losses = []
    for dqda, action in zip(dqdas, main_actions):
      if self._dqda_clipping is not None:
        dqda = tf.clip_by_value(dqda, -1 * self._dqda_clipping,
                                self._dqda_clipping)
      loss = common.element_wise_squared_loss(
          tf.stop_gradient(dqda + action), action)
      if nest_utils.is_batched_nested_tensors(
          time_steps, self.time_step_spec, num_outer_dims=2):
        # Sum over the time dimension.
        loss = tf.reduce_sum(loss, axis=1)
      if weights is not None:
        loss *= weights
      loss = tf.reduce_mean(loss)
      actor_losses.append(loss)

    actor_loss = tf.add_n(actor_losses)

  with tf.name_scope('Losses/'):
    tf.compat.v2.summary.scalar(
        name='actor_loss', data=actor_loss, step=self.train_step_counter)

  return actor_loss
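# Hypothetical usage of the multi-agent variant above (the names `agents`
# and `total_actions` are assumptions for illustration): each agent's actor
# is updated against the joint Q-value while the other agents' actions are
# held fixed, as in MADDPG-style training.
actor_losses = [
    agent.actor_loss(time_steps, total_actions, index=i, training=True)
    for i, agent in enumerate(agents)
]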
def actor_loss(self, time_steps, discrete_actions, weights=None):
  """Computes the actor_loss for TD3 training.

  Args:
    time_steps: A batch of timesteps.
    discrete_actions: A tensor of discrete action arguments.
    weights: Optional scalar or element-wise (per-batch-entry) importance
      weights.

  # TODO(b/124383618): Add an action norm regularizer.
  Returns:
    actor_loss: A scalar actor loss.
  """
  with tf.name_scope('actor_loss'):
    with tf.GradientTape(watch_accessed_variables=True) as tape:
      q_values, continuous_actions = self._compute_q_values(
          self._q_network_1, time_steps, discrete_actions)
      continuous_actions = tf.nest.flatten(continuous_actions)
      tape.watch(continuous_actions)

    dqdas = tape.gradient([q_values], continuous_actions)
    actor_losses = []
    for dqda, cont_action in zip(dqdas, continuous_actions):
      if self._dqda_clipping is not None:
        dqda = tf.clip_by_value(dqda, -1 * self._dqda_clipping,
                                self._dqda_clipping)
      # Mask out continuous actions that are irrelevant to each
      # discrete action.
      multi_dim_actions = tf.nest.flatten(
          self._action_spec.q_network)[0].shape.ndims > 0
      if multi_dim_actions:
        raise NotImplementedError(
            "multidimensional action space is not supported")
      discrete_actions_shape = tf.shape(discrete_actions)
      cont_action_mask = tf.cast(
          tf.gather_nd(self._action_params_mask,
                       tf.reshape(discrete_actions, [-1, 1])), tf.float32)
      cont_action_mask = tf.reshape(
          cont_action_mask,
          tf.concat([discrete_actions_shape, [-1]], axis=-1))
      loss = common.element_wise_squared_loss(
          tf.stop_gradient(dqda + cont_action), cont_action)
      tf.nest.assert_same_structure(loss, cont_action_mask)
      loss = loss * cont_action_mask
      if nest_utils.is_batched_nested_tensors(
          time_steps, self.time_step_spec, num_outer_dims=2):
        # Sum over the time dimension.
        loss = tf.reduce_sum(loss, axis=1)
      if weights is not None:
        loss *= weights
      loss = tf.reduce_mean(loss)
      actor_losses.append(loss)

    actor_loss = tf.add_n(actor_losses)

  with tf.name_scope('Losses/'):
    tf.compat.v2.summary.scalar(
        name='actor_loss', data=actor_loss, step=self.train_step_counter)

  return actor_loss
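# A standalone sketch of the mask lookup used above (all values here are
# hypothetical): row i of the mask table marks which continuous parameters
# belong to discrete action i, and tf.gather_nd selects the matching row
# for each sampled discrete action in the batch.
import tensorflow as tf

action_params_mask = tf.constant([[1, 1, 0],   # action 0 uses params 0 and 1
                                  [0, 0, 1]])  # action 1 uses param 2
discrete_actions = tf.constant([1, 0, 1])      # a batch of discrete actions
cont_action_mask = tf.cast(
    tf.gather_nd(action_params_mask, tf.reshape(discrete_actions, [-1, 1])),
    tf.float32)
# cont_action_mask == [[0., 0., 1.], [1., 1., 0.], [0., 0., 1.]]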