def testTimeMajorBatchMajorDiscountedReturnsAreSame(self, num_time_steps,
                                                    batch_size,
                                                    with_final_value):
  rewards = np.random.rand(num_time_steps, batch_size).astype(np.float32)
  discounts = np.random.rand(num_time_steps, batch_size).astype(np.float32)
  final_value = np.random.rand(batch_size).astype(
      np.float32) if with_final_value else None

  time_major_discounted_return = value_ops.discounted_return(
      rewards=rewards, discounts=discounts, final_value=final_value)
  batch_major_discounted_return = value_ops.discounted_return(
      rewards=tf.transpose(a=rewards),
      discounts=tf.transpose(a=discounts),
      final_value=final_value,
      time_major=False)

  self.assertAllClose(time_major_discounted_return,
                      tf.transpose(a=batch_major_discounted_return))
def compute_return_and_advantage(self, next_time_steps, value_preds):
  """Compute the Monte Carlo return and advantage.

  Normalization will be applied to the computed returns and advantages if
  it's enabled.

  Args:
    next_time_steps: batched tensor of TimeStep tuples after action is taken.
    value_preds: Batched value prediction tensor. Should have one more entry
      in time index than time_steps, with the final value corresponding to the
      value prediction of the final state.

  Returns:
    tuple of (return, normalized_advantage), both are batched tensors.
  """
  discounts = next_time_steps.discount * tf.constant(
      self._discount_factor, dtype=tf.float32)
  rewards = next_time_steps.reward

  # Normalize rewards if self._reward_normalizer is defined.
  if self._reward_normalizer:
    rewards = self._reward_normalizer.normalize(
        rewards, center_mean=False, clip_value=self._reward_norm_clipping)

  # Make discount 0.0 at the end of each episode to restart the cumulative
  # sum at each episode boundary.
  episode_mask = common.get_episode_mask(next_time_steps)
  discounts *= episode_mask

  # Compute Monte Carlo returns.
  returns = value_ops.discounted_return(rewards, discounts, time_major=False)

  # Compute advantages.
  advantages = self.compute_advantages(rewards, returns, discounts,
                                       value_preds)
  normalized_advantages = _normalize_advantages(advantages, axes=(0, 1))

  # Return TD-Lambda returns if both use_td_lambda_return and use_gae.
  if self._use_td_lambda_return:
    if not self._use_gae:
      logging.warning('use_td_lambda_return was True, but use_gae was '
                      'False. Using Monte Carlo return.')
    else:
      returns = tf.add(
          advantages, value_preds[:, :-1], name='td_lambda_returns')

  return returns, normalized_advantages
def _train(self, experience, weights=None):
  # Add a mask to ensure we reset the return calculation at episode
  # boundaries. This is needed in cases where episodes are truncated before
  # reaching a terminal state.
  non_last_mask = tf.cast(
      tf.math.not_equal(experience.next_step_type, ts.StepType.LAST),
      tf.float32)
  discounts = non_last_mask * experience.discount * self._gamma
  returns = value_ops.discounted_return(
      experience.reward, discounts, time_major=False)

  if self._debug_summaries:
    tf.compat.v2.summary.histogram(
        name='rewards', data=experience.reward, step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='discounts',
        data=experience.discount,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='returns', data=returns, step=self.train_step_counter)

  time_step = ts.TimeStep(experience.step_type,
                          tf.zeros_like(experience.reward),
                          tf.zeros_like(experience.discount),
                          experience.observation)

  with tf.GradientTape() as tape:
    loss_info = self.total_loss(
        time_step,
        experience.action,
        tf.stop_gradient(returns),
        weights=weights)
    tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')

  variables_to_train = self._actor_network.trainable_weights
  if self._baseline:
    variables_to_train += self._value_network.trainable_weights
  grads = tape.gradient(loss_info.loss, variables_to_train)

  grads_and_vars = list(zip(grads, variables_to_train))
  if self._gradient_clipping:
    grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                     self._gradient_clipping)

  if self._summarize_grads_and_vars:
    eager_utils.add_variables_summaries(grads_and_vars,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)

  self._optimizer.apply_gradients(
      grads_and_vars, global_step=self.train_step_counter)

  return tf.nest.map_structure(tf.identity, loss_info)
def _train(self, experience, weights=None):
  returns = value_ops.discounted_return(
      experience.reward, experience.discount, time_major=False)

  if self._debug_summaries:
    tf.compat.v2.summary.histogram(
        name='rewards', data=experience.reward, step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='discounts',
        data=experience.discount,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='returns', data=returns, step=self.train_step_counter)

  # TODO(b/126592060): replace with tensor normalizer.
  if self._normalize_returns:
    returns = _standard_normalize(returns, axes=(0, 1))
    if self._debug_summaries:
      tf.compat.v2.summary.histogram(
          name='normalized_returns',
          data=returns,
          step=self.train_step_counter)

  time_step = ts.TimeStep(experience.step_type,
                          tf.zeros_like(experience.reward),
                          tf.zeros_like(experience.discount),
                          experience.observation)

  variables_to_train = self._actor_network.variables
  with tf.GradientTape() as tape:
    loss_info = self._loss(
        time_step,
        experience.action,
        tf.stop_gradient(returns),
        weights=weights)
    tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')

  grads = tape.gradient(loss_info.loss, variables_to_train)
  grads_and_vars = list(zip(grads, variables_to_train))
  if self._gradient_clipping:
    grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                     self._gradient_clipping)

  if self._summarize_grads_and_vars:
    eager_utils.add_variables_summaries(grads_and_vars,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)

  self._optimizer.apply_gradients(
      grads_and_vars, global_step=self.train_step_counter)

  return tf.nest.map_structure(tf.identity, loss_info)
def testDiscountedReturnWithFinalValueMatchPrecomputedResult(self):
  discounted_return = value_ops.discounted_return(
      rewards=tf.constant([1] * 9, dtype=tf.float32),
      discounts=tf.constant([1, 1, 1, 1, 0, 0.9, 0.9, 0.9, 0.9],
                            dtype=tf.float32),
      final_value=tf.constant(8, dtype=tf.float32))

  expected = [
      5, 4, 3, 2, 1, 8 * 0.9**4 + 3.439, 8 * 0.9**3 + 2.71, 8 * 0.9**2 + 1.9,
      8 * 0.9 + 1
  ]

  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(discounted_return, expected)
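# A numpy cross-check of the precomputed values above (a sketch, not part of
# the original test): `discounted_return` follows the backward recursion
# G_t = r_t + d_t * G_{t+1}, seeded past the last step with `final_value`.
# The zero discount at index 4 ends the first episode, so the first five
# returns are plain counts.
import numpy as np

rewards = np.ones(9, dtype=np.float32)
discounts = np.array([1, 1, 1, 1, 0, 0.9, 0.9, 0.9, 0.9], dtype=np.float32)
acc = 8.0  # final_value
returns = np.zeros(9, dtype=np.float32)
for t in reversed(range(9)):
  acc = rewards[t] + discounts[t] * acc
  returns[t] = acc
np.testing.assert_allclose(
    returns,
    [5, 4, 3, 2, 1, 8 * 0.9**4 + 3.439, 8 * 0.9**3 + 2.71, 8 * 0.9**2 + 1.9,
     8 * 0.9 + 1],
    rtol=1e-5)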
def testDiscountedReturnIsCorrectlyComputed(self, num_time_steps, batch_size,
                                            with_final_value):
  rewards = np.random.rand(num_time_steps, batch_size).astype(np.float32)
  discounts = np.random.rand(num_time_steps, batch_size).astype(np.float32)
  final_value = np.random.rand(batch_size).astype(
      np.float32) if with_final_value else None

  discounted_return = value_ops.discounted_return(
      rewards=rewards, discounts=discounts, final_value=final_value)
  single_discounted_return = value_ops.discounted_return(
      rewards=rewards,
      discounts=discounts,
      final_value=final_value,
      provide_all_returns=False)

  expected = _numpy_discounted_return(
      rewards=rewards, discounts=discounts, final_value=final_value)

  self.assertAllClose(discounted_return, expected)
  self.assertAllClose(single_discounted_return, expected[0])
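# `_numpy_discounted_return` is defined elsewhere in the test file; a minimal
# sketch of such a reference implementation, assuming time-major [T, B]
# inputs and the recursion G_t = r_t + d_t * G_{t+1} seeded with
# `final_value` (zeros when it is None):
import numpy as np

def _numpy_discounted_return(rewards, discounts, final_value):
  accumulator = (np.zeros(rewards.shape[1], dtype=np.float32)
                 if final_value is None else final_value)
  returns = np.zeros_like(rewards)
  for t in reversed(range(rewards.shape[0])):
    accumulator = rewards[t] + discounts[t] * accumulator
    returns[t] = accumulator
  return returns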
def _train(self, experience, weights=None):
  # TODO(b/132914246): Use .is_last() to mask the end of each episode.
  returns = value_ops.discounted_return(
      experience.reward, experience.discount * self._gamma, time_major=False)

  if self._debug_summaries:
    tf.compat.v2.summary.histogram(
        name='rewards', data=experience.reward, step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='discounts',
        data=experience.discount,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='returns', data=returns, step=self.train_step_counter)

  time_step = ts.TimeStep(experience.step_type,
                          tf.zeros_like(experience.reward),
                          tf.zeros_like(experience.discount),
                          experience.observation)

  with tf.GradientTape() as tape:
    loss_info = self.total_loss(
        time_step,
        experience.action,
        tf.stop_gradient(returns),
        weights=weights)
    tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')

  variables_to_train = self._actor_network.trainable_weights
  if self._baseline:
    variables_to_train += self._value_network.trainable_weights
  grads = tape.gradient(loss_info.loss, variables_to_train)

  grads_and_vars = list(zip(grads, variables_to_train))
  if self._gradient_clipping:
    grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                     self._gradient_clipping)

  if self._summarize_grads_and_vars:
    eager_utils.add_variables_summaries(grads_and_vars,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)

  self._optimizer.apply_gradients(
      grads_and_vars, global_step=self.train_step_counter)

  return tf.nest.map_structure(tf.identity, loss_info)
def _loss(self,
          experience,
          td_errors_loss_fn=common.element_wise_huber_loss,
          gamma=1.0,
          reward_scale_factor=1.0,
          weights=None,
          training=False):
  """Computes loss for DQN training.

  Args:
    experience: A batch of experience data in the form of a `Trajectory`. The
      structure of `experience` must match that of `self.policy.step_spec`.
      All tensors in `experience` must be shaped `[batch, time, ...]` where
      `time` must be equal to `self.train_sequence_length` if that property
      is not `None`.
    td_errors_loss_fn: A function(td_targets, predictions) to compute the
      element wise loss.
    gamma: Discount for future rewards.
    reward_scale_factor: Multiplicative factor to scale rewards.
    weights: Optional scalar or elementwise (per-batch-entry) importance
      weights. The output td_loss will be scaled by these weights, and the
      final scalar loss is the mean of these values.
    training: Whether this loss is being used for training.

  Returns:
    loss: An instance of `DqnLossInfo`.

  Raises:
    ValueError: if the number of actions is greater than 1.
  """
  # Check that `experience` includes two outer dimensions [B, T, ...]. This
  # method requires a time dimension to compute the loss properly.
  self._check_trajectory_dimensions(experience)

  squeeze_time_dim = not self._q_network.state_spec
  if self._n_step_update == 1:
    time_steps, policy_steps, next_time_steps = (
        trajectory.experience_to_transitions(experience, squeeze_time_dim))
    actions = policy_steps.action
  else:
    # To compute n-step returns, we need the first time steps, the first
    # actions, and the last time steps. Therefore we extract the first and
    # last transitions from our Trajectory.
    first_two_steps = tf.nest.map_structure(lambda x: x[:, :2], experience)
    last_two_steps = tf.nest.map_structure(lambda x: x[:, -2:], experience)
    time_steps, policy_steps, _ = (
        trajectory.experience_to_transitions(first_two_steps,
                                             squeeze_time_dim))
    actions = policy_steps.action
    _, _, next_time_steps = (
        trajectory.experience_to_transitions(last_two_steps,
                                             squeeze_time_dim))

  with tf.name_scope('loss'):
    q_values = self._compute_q_values(time_steps, actions, training=training)

    next_q_values = self._compute_next_q_values(next_time_steps,
                                                policy_steps.info)

    if self._n_step_update == 1:
      # Special case for n = 1 to avoid a loss of performance.
      td_targets = compute_td_targets(
          next_q_values,
          rewards=reward_scale_factor * next_time_steps.reward,
          discounts=gamma * next_time_steps.discount)
    else:
      # When computing discounted return, we need to throw out the last time
      # index of both reward and discount, which are filled with dummy values
      # to match the dimensions of the observation.
      rewards = reward_scale_factor * experience.reward[:, :-1]
      discounts = gamma * experience.discount[:, :-1]

      # TODO(b/134618876): Properly handle Trajectories that include episode
      # boundaries with nonzero discount.

      td_targets = value_ops.discounted_return(
          rewards=rewards,
          discounts=discounts,
          final_value=next_q_values,
          time_major=False,
          provide_all_returns=False)

    valid_mask = tf.cast(~time_steps.is_last(), tf.float32)
    td_error = valid_mask * (td_targets - q_values)

    td_loss = valid_mask * td_errors_loss_fn(td_targets, q_values)

    if nest_utils.is_batched_nested_tensors(
        time_steps, self.time_step_spec, num_outer_dims=2):
      # Do a sum over the time dimension.
      td_loss = tf.reduce_sum(input_tensor=td_loss, axis=1)

    # Aggregate across the elements of the batch and add regularization loss.
    # Note: We use an element wise loss above to ensure each element is
    # always weighted by 1/N where N is the batch size, even when some of
    # the weights are zero due to boundary transitions. Weighting by 1/K,
    # where K is the actual number of non-zero weights, would artificially
    # increase their contribution in the loss. Think about what would happen
    # as the number of boundary samples increases.
    agg_loss = common.aggregate_losses(
        per_example_loss=td_loss,
        sample_weight=weights,
        regularization_loss=self._q_network.losses)
    total_loss = agg_loss.total_loss

    losses_dict = {
        'td_loss': agg_loss.weighted,
        'reg_loss': agg_loss.regularization,
        'total_loss': total_loss
    }

    common.summarize_scalar_dict(
        losses_dict, step=self.train_step_counter, name_scope='Losses/')

    if self._summarize_grads_and_vars:
      with tf.name_scope('Variables/'):
        for var in self._q_network.trainable_weights:
          tf.compat.v2.summary.histogram(
              name=var.name.replace(':', '_'),
              data=var,
              step=self.train_step_counter)

    if self._debug_summaries:
      diff_q_values = q_values - next_q_values
      common.generate_tensor_summaries('td_error', td_error,
                                       self.train_step_counter)
      common.generate_tensor_summaries('td_loss', td_loss,
                                       self.train_step_counter)
      common.generate_tensor_summaries('q_values', q_values,
                                       self.train_step_counter)
      common.generate_tensor_summaries('next_q_values', next_q_values,
                                       self.train_step_counter)
      common.generate_tensor_summaries('diff_q_values', diff_q_values,
                                       self.train_step_counter)

    return tf_agent.LossInfo(total_loss,
                             DqnLossInfo(td_loss=td_loss, td_error=td_error))
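# In the n-step branch above, `discounted_return` with
# `provide_all_returns=False` yields the standard n-step TD target. A small
# numpy sketch of that identity, using hypothetical values rather than agent
# code:
import numpy as np

gamma = 0.99
rewards = np.array([0.5, 1.0, -0.2])  # r_t .. r_{t+n-1}, n = 3
discounts = gamma * np.ones(3)        # no episode boundary in this window
next_q = 2.0                          # bootstrap value from the target net

# Backward recursion used by discounted_return, seeded with the final value.
target = next_q
for r, d in zip(reversed(rewards), reversed(discounts)):
  target = r + d * target

# Equivalent closed form: sum_k gamma^k r_{t+k} + gamma^n Q(s_{t+n}).
closed_form = sum(gamma**k * r for k, r in enumerate(rewards))
closed_form += gamma**3 * next_q
np.testing.assert_allclose(target, closed_form)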
def _loss(self,
          experience,
          td_errors_loss_fn=tf.losses.huber_loss,
          gamma=1.0,
          reward_scale_factor=1.0,
          weights=None):
  """Computes critic loss for CategoricalDQN training.

  See Algorithm 1 and the discussion immediately preceding it in page 6 of
  "A Distributional Perspective on Reinforcement Learning"
    Bellemare et al., 2017
    https://arxiv.org/abs/1707.06887

  Args:
    experience: A batch of experience data in the form of a `Trajectory`. The
      structure of `experience` must match that of `self.policy.step_spec`.
      All tensors in `experience` must be shaped `[batch, time, ...]` where
      `time` must be equal to `self.required_experience_time_steps` if that
      property is not `None`.
    td_errors_loss_fn: A function(td_targets, predictions) to compute loss.
    gamma: Discount for future rewards.
    reward_scale_factor: Multiplicative factor to scale rewards.
    weights: Optional weights used for importance sampling.

  Returns:
    critic_loss: A scalar critic loss.

  Raises:
    ValueError: if the number of actions is greater than 1.
  """
  # Check that `experience` includes two outer dimensions [B, T, ...]. This
  # method requires a time dimension to compute the loss properly.
  self._check_trajectory_dimensions(experience)

  if self._n_step_update == 1:
    time_steps, actions, next_time_steps = self._experience_to_transitions(
        experience)
  else:
    # To compute n-step returns, we need the first time steps, the first
    # actions, and the last time steps. Therefore we extract the first and
    # last transitions from our Trajectory.
    first_two_steps = tf.nest.map_structure(lambda x: x[:, :2], experience)
    last_two_steps = tf.nest.map_structure(lambda x: x[:, -2:], experience)
    time_steps, actions, _ = self._experience_to_transitions(first_two_steps)
    _, _, next_time_steps = self._experience_to_transitions(last_two_steps)

  with tf.name_scope('critic_loss'):
    tf.nest.assert_same_structure(actions, self.action_spec)
    tf.nest.assert_same_structure(time_steps, self.time_step_spec)
    tf.nest.assert_same_structure(next_time_steps, self.time_step_spec)

    rank = nest_utils.get_outer_rank(time_steps.observation,
                                     self._time_step_spec.observation)

    # If inputs have a time dimension and the q_network is stateful,
    # combine the batch and time dimension.
    batch_squash = (None
                    if rank <= 1 or self._q_network.state_spec in ((), None)
                    else utils.BatchSquash(rank))

    # q_logits contains the Q-value logits for all actions.
    q_logits, _ = self._q_network(time_steps.observation,
                                  time_steps.step_type)
    next_q_distribution = self._next_q_distribution(next_time_steps,
                                                    batch_squash)

    if batch_squash is not None:
      # Squash outer dimensions to a single dimension to make computing the
      # loss below easier. Required for supporting temporal inputs, for
      # example.
      q_logits = batch_squash.flatten(q_logits)
      actions = batch_squash.flatten(actions)
      next_time_steps = tf.nest.map_structure(batch_squash.flatten,
                                              next_time_steps)

    actions = tf.nest.flatten(actions)[0]
    if actions.shape.ndims > 1:
      actions = tf.squeeze(actions, range(1, actions.shape.ndims))

    # Project the sample Bellman update \hat{T}Z_{\theta} onto the original
    # support of Z_{\theta} (see Figure 1 in paper).
    batch_size = tf.shape(q_logits)[0]
    tiled_support = tf.tile(self._support, [batch_size])
    tiled_support = tf.reshape(tiled_support, [batch_size, self._num_atoms])

    if self._n_step_update == 1:
      discount = next_time_steps.discount
      if discount.shape.ndims == 1:
        # We expect discount to have a shape of [batch_size], while
        # tiled_support will have a shape of [batch_size, num_atoms]. To
        # multiply these, we add a second dimension of 1 to the discount.
        discount = discount[:, None]
      next_value_term = tf.multiply(discount,
                                    tiled_support,
                                    name='next_value_term')

      reward = next_time_steps.reward
      if reward.shape.ndims == 1:
        # See the explanation above.
        reward = reward[:, None]
      reward_term = tf.multiply(reward_scale_factor,
                                reward,
                                name='reward_term')

      target_support = tf.add(reward_term, gamma * next_value_term,
                              name='target_support')
    else:
      # When computing discounted return, we need to throw out the last time
      # index of both reward and discount, which are filled with dummy values
      # to match the dimensions of the observation.
      rewards = reward_scale_factor * experience.reward[:, :-1]
      discounts = gamma * experience.discount[:, :-1]

      # TODO(b/134618876): Properly handle Trajectories that include episode
      # boundaries with nonzero discount.

      # TODO(b/131557265): Replace value_ops.discounted_return with a method
      # that only computes the single value needed.
      discounted_rewards = value_ops.discounted_return(
          rewards=rewards,
          discounts=discounts,
          final_value=tf.zeros([batch_size], dtype=discounts.dtype),
          time_major=False)

      # We only need the first value within the time dimension which
      # corresponds to the full final return. The remaining values are only
      # partial returns.
      discounted_rewards = discounted_rewards[:, :1]

      final_value_discount = tf.reduce_prod(discounts, axis=1)
      final_value_discount = final_value_discount[:, None]

      # Save the values of discounted_rewards and final_value_discount in
      # order to check them in unit tests.
      self._discounted_rewards = discounted_rewards
      self._final_value_discount = final_value_discount

      target_support = tf.add(discounted_rewards,
                              final_value_discount * tiled_support,
                              name='target_support')

    target_distribution = tf.stop_gradient(
        project_distribution(target_support, next_q_distribution,
                             self._support))

    # Obtain the current Q-value logits for the selected actions.
    indices = tf.range(tf.shape(q_logits)[0])[:, None]
    indices = tf.cast(indices, actions.dtype)
    reshaped_actions = tf.concat([indices, actions[:, None]], 1)
    chosen_action_logits = tf.gather_nd(q_logits, reshaped_actions)

    # Compute the cross-entropy loss between the logits. If inputs have
    # a time dimension, compute the sum over the time dimension before
    # computing the mean over the batch dimension.
    if batch_squash is not None:
      target_distribution = batch_squash.unflatten(target_distribution)
      chosen_action_logits = batch_squash.unflatten(chosen_action_logits)
      critic_loss = tf.reduce_mean(
          tf.reduce_sum(
              tf.nn.softmax_cross_entropy_with_logits_v2(
                  labels=target_distribution, logits=chosen_action_logits),
              axis=1))
    else:
      critic_loss = tf.reduce_mean(
          tf.nn.softmax_cross_entropy_with_logits_v2(
              labels=target_distribution, logits=chosen_action_logits))

    with tf.name_scope('Losses/'):
      tf.compat.v2.summary.scalar(
          'critic_loss', critic_loss, step=self.train_step_counter)

    if self._debug_summaries:
      distribution_errors = target_distribution - chosen_action_logits
      with tf.name_scope('distribution_errors'):
        common.generate_tensor_summaries(
            'distribution_errors',
            distribution_errors,
            step=self.train_step_counter)
        tf.compat.v2.summary.scalar(
            'mean',
            tf.reduce_mean(distribution_errors),
            step=self.train_step_counter)
        tf.compat.v2.summary.scalar(
            'mean_abs',
            tf.reduce_mean(tf.abs(distribution_errors)),
            step=self.train_step_counter)
        tf.compat.v2.summary.scalar(
            'max',
            tf.reduce_max(distribution_errors),
            step=self.train_step_counter)
        tf.compat.v2.summary.scalar(
            'min',
            tf.reduce_min(distribution_errors),
            step=self.train_step_counter)
      with tf.name_scope('target_distribution'):
        common.generate_tensor_summaries(
            'target_distribution',
            target_distribution,
            step=self.train_step_counter)

    # TODO(b/127318640): Give appropriate values for td_loss and td_error for
    # prioritized replay.
    return tf_agent.LossInfo(critic_loss,
                             dqn_agent.DqnLossInfo(td_loss=(), td_error=()))
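# `project_distribution` above spreads the probability mass located at the
# shifted atoms back onto the fixed support (Algorithm 1 of Bellemare et
# al., 2017). A minimal single-sample numpy sketch of that projection,
# assuming an evenly spaced support; this illustrates the idea and is not
# the library implementation:
import numpy as np

def project_onto_support(target_support, probs, support):
  v_min, v_max = support[0], support[-1]
  delta_z = support[1] - support[0]
  projected = np.zeros_like(support)
  for z, p in zip(np.clip(target_support, v_min, v_max), probs):
    b = (z - v_min) / delta_z          # fractional atom index
    lower, upper = int(np.floor(b)), int(np.ceil(b))
    if lower == upper:                 # z landed exactly on an atom
      projected[lower] += p
    else:                              # split mass between the neighbors
      projected[lower] += p * (upper - b)
      projected[upper] += p * (b - lower)
  return projected

support = np.linspace(-1.0, 1.0, 5)            # 5 atoms
probs = np.array([0.1, 0.2, 0.4, 0.2, 0.1])
shifted = 0.5 + 0.9 * support                  # r + gamma * z
assert np.isclose(project_onto_support(shifted, probs, support).sum(), 1.0)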
def to_n_step_transition(
    trajectory: Trajectory,
    gamma: types.Float
) -> Transition:
  """Create an n-step transition from a trajectory with `T=N + 1` frames.

  **NOTE** Tensors of `trajectory` are sliced along their *second* (`time`)
  dimension, to pull out the appropriate fields for the n-step transitions.

  The output transition's `next_time_step.{reward, discount}` will contain
  N-step discounted reward and discount values calculated as:

  ```
  next_time_step.reward = r_t +
                          g^{1} * d_t * r_{t+1} +
                          g^{2} * d_t * d_{t+1} * r_{t+2} +
                          g^{3} * d_t * d_{t+1} * d_{t+2} * r_{t+3} +
                          ...
                          g^{N-1} * d_t * ... * d_{t+N-2} * r_{t+N-1}
  next_time_step.discount = g^{N-1} * d_t * d_{t+1} * ... * d_{t+N-1}
  ```

  In python notation:

  ```python
  discount = gamma**(N-1) * reduce_prod(trajectory.discount[:, :-1])
  reward = discounted_return(
      rewards=trajectory.reward[:, :-1],
      discounts=gamma * trajectory.discount[:, :-1])
  ```

  When `trajectory.discount[:, :-1]` is an all-ones tensor, this is
  equivalent to:

  ```python
  next_time_step.discount = (
      gamma**(N-1) * tf.ones_like(trajectory.discount[:, 0]))
  next_time_step.reward = (
      sum_{n=0}^{N-1} gamma**n * trajectory.reward[:, n])
  ```

  Args:
    trajectory: An instance of `Trajectory`. The tensors in Trajectory must
      have shape `[B, T, ...]`. `discount` is assumed to be a scalar float,
      hence the shape of `trajectory.discount` must be `[B, T]`.
    gamma: A floating point scalar; the discount factor.

  Returns:
    An N-step `Transition` where `N = T - 1`. The reward and discount in
    `time_step.{reward, discount}` are NaN. The n-step discounted reward and
    final discount are stored in `next_time_step.{reward, discount}`. All
    tensors in the `Transition` have shape `[B, ...]` (no time dimension).

  Raises:
    ValueError: if `discount.shape.rank != 2`.
    ValueError: if `discount.shape[1] < 2`.
  """
  _validate_rank(trajectory.discount, min_rank=2, max_rank=2)

  # Use static values when available, so that we can use XLA when the time
  # dimension is fixed.
  time_dim = (tf.compat.dimension_value(trajectory.discount.shape[1])
              or tf.shape(trajectory.discount)[1])
  static_time_dim = tf.get_static_value(time_dim)
  if static_time_dim in (0, 1):
    raise ValueError(
        'Trajectory frame count must be at least 2, but saw {}. Shape of '
        'trajectory.discount: {}'.format(static_time_dim,
                                         trajectory.discount.shape))
  n = time_dim - 1

  # Use composite calculations to ensure we properly handle SparseTensor etc
  # in the observations.
  # pylint: disable=g-long-lambda
  # Pull out x[:,0] for x in trajectory.
  first_frame = tf.nest.map_structure(
      lambda t: composite.squeeze(
          composite.slice_to(t, axis=1, end=1), axis=1),
      trajectory)
  # Pull out x[:,-1] for x in trajectory.
  final_frame = tf.nest.map_structure(
      lambda t: composite.squeeze(
          composite.slice_from(t, axis=1, start=-1), axis=1),
      trajectory)
  # pylint: enable=g-long-lambda

  # When computing discounted return, we need to throw out the last time
  # index of both reward and discount, which are filled with dummy values
  # to match the dimensions of the observation.
  reward = trajectory.reward[:, :-1]
  discount = trajectory.discount[:, :-1]

  policy_steps = policy_step.PolicyStep(
      action=first_frame.action, state=(), info=first_frame.policy_info)

  discounted_reward = value_ops.discounted_return(
      rewards=reward,
      discounts=gamma * discount,
      time_major=False,
      provide_all_returns=False)

  # NOTE: `final_discount` will have one less discount than `discount`.
  # This is so that when the learner/update uses an additional
  # discount (e.g. gamma) we don't apply it twice.
  final_discount = gamma**(n - 1) * tf.math.reduce_prod(discount, axis=1)

  time_steps = ts.TimeStep(
      first_frame.step_type,
      # unknown
      reward=tf.nest.map_structure(
          lambda r: np.nan * tf.ones_like(r), first_frame.reward),
      # unknown
      discount=np.nan * tf.ones_like(first_frame.discount),
      observation=first_frame.observation)
  next_time_steps = ts.TimeStep(
      step_type=final_frame.step_type,
      reward=discounted_reward,
      discount=final_discount,
      observation=final_frame.observation)
  return Transition(time_steps, policy_steps, next_time_steps)
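# A numpy check of the docstring formulas for a single batch entry with
# T = 4 frames (N = 3); all values are hypothetical:
import numpy as np

gamma = 0.9
r = np.array([1.0, 2.0, 3.0, 99.0])  # the last entry is a dummy frame
d = np.array([1.0, 1.0, 1.0, 99.0])  # sliced off before use, as above

# next_time_step.reward = r_0 + g*d_0*r_1 + g^2*d_0*d_1*r_2
n_step_reward = r[0] + gamma * d[0] * r[1] + gamma**2 * d[0] * d[1] * r[2]
# With all-ones discounts this reduces to sum_n gamma^n * r_n.
np.testing.assert_allclose(
    n_step_reward, sum(gamma**n * r[n] for n in range(3)))

# next_time_step.discount = gamma^(N-1) * prod(d[:-1])
final_discount = gamma**(3 - 1) * np.prod(d[:-1])
np.testing.assert_allclose(final_discount, gamma**2)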
def critic_loss(self, experience, gamma=1.0, weights=None):
  """Computes the critic loss for TD3 training.

  Args:
    experience: A batch of timesteps.
    gamma: reward discount factor.
    weights: Optional scalar or element-wise (per-batch-entry) importance
      weights.

  Returns:
    critic_loss: A scalar critic loss.
  """
  with tf.name_scope('critic_loss'):
    self._check_trajectory_dimensions(experience)

    if self._n_step_update == 1:
      time_steps, actions, next_time_steps = self._experience_to_transitions(
          experience)
    else:
      # To compute n-step returns, we need the first time steps, the first
      # actions, and the last time steps. Therefore we extract the first and
      # last transitions from our Trajectory.
      first_two_steps = tf.nest.map_structure(lambda x: x[:, :2], experience)
      last_two_steps = tf.nest.map_structure(lambda x: x[:, -2:], experience)
      time_steps, actions, _ = self._experience_to_transitions(
          first_two_steps)
      _, _, next_time_steps = self._experience_to_transitions(last_two_steps)

    # Target q-values are the min of the two networks.
    target_q_values_1 = self._compute_next_q_values(
        self._target_q_value_policies_1, next_time_steps)
    target_q_values_2 = self._compute_next_q_values(
        self._target_q_value_policies_2, next_time_steps)
    target_q_values = tf.minimum(target_q_values_1, target_q_values_2)

    if self._n_step_update == 1:
      # Special case for n = 1 to avoid a loss of performance.
      td_targets = compute_td_targets(
          target_q_values,
          rewards=self._reward_scale_factor * next_time_steps.reward,
          discounts=gamma * next_time_steps.discount)
    else:
      # When computing discounted return, we need to throw out the last time
      # index of both reward and discount, which are filled with dummy values
      # to match the dimensions of the observation.
      rewards = self._reward_scale_factor * experience.reward[:, :-1]
      discounts = gamma * experience.discount[:, :-1]
      td_targets = value_ops.discounted_return(
          rewards=rewards,
          discounts=discounts,
          final_value=target_q_values,
          time_major=False,
          provide_all_returns=False)

    pred_td_targets_1 = self._compute_q_values(self._q_value_policies_1,
                                               time_steps, actions)
    pred_td_targets_2 = self._compute_q_values(self._q_value_policies_2,
                                               time_steps, actions)
    pred_td_targets_all = [pred_td_targets_1, pred_td_targets_2]

    if self._debug_summaries:
      tf.compat.v2.summary.histogram(
          name='td_targets', data=td_targets, step=self.train_step_counter)
      with tf.name_scope('td_targets'):
        tf.compat.v2.summary.scalar(
            name='mean',
            data=tf.reduce_mean(input_tensor=td_targets),
            step=self.train_step_counter)
        tf.compat.v2.summary.scalar(
            name='max',
            data=tf.reduce_max(input_tensor=td_targets),
            step=self.train_step_counter)
        tf.compat.v2.summary.scalar(
            name='min',
            data=tf.reduce_min(input_tensor=td_targets),
            step=self.train_step_counter)

      for td_target_idx in range(2):
        pred_td_targets = pred_td_targets_all[td_target_idx]
        td_errors = td_targets - pred_td_targets
        with tf.name_scope('critic_net_%d' % (td_target_idx + 1)):
          tf.compat.v2.summary.histogram(
              name='td_errors',
              data=td_errors,
              step=self.train_step_counter)
          tf.compat.v2.summary.histogram(
              name='pred_td_targets',
              data=pred_td_targets,
              step=self.train_step_counter)
          with tf.name_scope('td_errors'):
            tf.compat.v2.summary.scalar(
                name='mean',
                data=tf.reduce_mean(input_tensor=td_errors),
                step=self.train_step_counter)
            tf.compat.v2.summary.scalar(
                name='mean_abs',
                data=tf.reduce_mean(input_tensor=tf.abs(td_errors)),
                step=self.train_step_counter)
            tf.compat.v2.summary.scalar(
                name='max',
                data=tf.reduce_max(input_tensor=td_errors),
                step=self.train_step_counter)
            tf.compat.v2.summary.scalar(
                name='min',
                data=tf.reduce_min(input_tensor=td_errors),
                step=self.train_step_counter)
          with tf.name_scope('pred_td_targets'):
            tf.compat.v2.summary.scalar(
                name='mean',
                data=tf.reduce_mean(input_tensor=pred_td_targets),
                step=self.train_step_counter)
            tf.compat.v2.summary.scalar(
                name='max',
                data=tf.reduce_max(input_tensor=pred_td_targets),
                step=self.train_step_counter)
            tf.compat.v2.summary.scalar(
                name='min',
                data=tf.reduce_min(input_tensor=pred_td_targets),
                step=self.train_step_counter)

    critic_loss = (self._td_errors_loss_fn(td_targets, pred_td_targets_1) +
                   self._td_errors_loss_fn(td_targets, pred_td_targets_2))
    if nest_utils.is_batched_nested_tensors(
        time_steps, self.time_step_spec, num_outer_dims=2):
      # Sum over the time dimension.
      critic_loss = tf.reduce_sum(input_tensor=critic_loss, axis=1)

    if weights is not None:
      critic_loss *= weights

    return tf.reduce_mean(input_tensor=critic_loss)
def compute_return_and_advantage(self, next_time_steps, value_preds):
  """Compute the Monte Carlo return and advantage.

  Normalization will be applied to the computed returns and advantages if
  it's enabled.

  Args:
    next_time_steps: batched tensor of TimeStep tuples after action is taken.
    value_preds: Batched value prediction tensor. Should have one more entry
      in time index than time_steps, with the final value corresponding to the
      value prediction of the final state.

  Returns:
    tuple of (return, normalized_advantage), both are batched tensors.
  """
  discounts = next_time_steps.discount * tf.constant(
      self._discount_factor, dtype=tf.float32)

  rewards = next_time_steps.reward
  if self._debug_summaries:
    # Summarize rewards before they get normalized below.
    tf.compat.v2.summary.histogram(
        name='rewards', data=rewards, step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='rewards_mean',
        data=tf.reduce_mean(rewards),
        step=self.train_step_counter)

  # Normalize rewards if self._reward_normalizer is defined.
  if self._reward_normalizer:
    rewards = self._reward_normalizer.normalize(
        rewards, center_mean=False, clip_value=self._reward_norm_clipping)
    if self._debug_summaries:
      tf.compat.v2.summary.histogram(
          name='rewards_normalized',
          data=rewards,
          step=self.train_step_counter)
      tf.compat.v2.summary.scalar(
          name='rewards_normalized_mean',
          data=tf.reduce_mean(rewards),
          step=self.train_step_counter)

  # Make discount 0.0 at the end of each episode to restart the cumulative
  # sum at each episode boundary.
  episode_mask = common.get_episode_mask(next_time_steps)
  discounts *= episode_mask

  # Compute Monte Carlo returns. Data from incomplete trajectories that do
  # not contain the end of an episode is also used, with a bootstrapped
  # estimate from the last value.
  # Note that when a trajectory driver is used and the final step is
  # terminal, the bootstrapped estimate will not be used, as it is
  # multiplied by zero (the discount on the last step).
  final_value_bootstrapped = value_preds[:, -1]
  returns = value_ops.discounted_return(
      rewards,
      discounts,
      time_major=False,
      final_value=final_value_bootstrapped)
  if self._debug_summaries:
    tf.compat.v2.summary.histogram(
        name='returns', data=returns, step=self.train_step_counter)

  # Compute advantages.
  advantages = self.compute_advantages(rewards, returns, discounts,
                                       value_preds)
  normalized_advantages = _normalize_advantages(advantages, axes=(0, 1))
  if self._debug_summaries:
    tf.compat.v2.summary.histogram(
        name='advantages', data=advantages, step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='advantages_normalized',
        data=normalized_advantages,
        step=self.train_step_counter)

  # Return TD-Lambda returns if both use_td_lambda_return and use_gae.
  if self._use_td_lambda_return:
    if not self._use_gae:
      logging.warning('use_td_lambda_return was True, but use_gae was '
                      'False. Using Monte Carlo return.')
    else:
      returns = tf.add(
          advantages, value_preds[:, :-1], name='td_lambda_returns')

  return returns, normalized_advantages
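# A small numpy illustration of the two tricks above, with hypothetical
# values: zeroing the discount at an episode boundary restarts the backward
# cumulative sum, and the bootstrapped final value only contributes when the
# discount on the last step is nonzero.
import numpy as np

rewards = np.array([1.0, 1.0, 1.0, 1.0])
discounts = np.array([0.9, 0.0, 0.9, 0.9])  # episode ends after step 1
final_value = 5.0                            # bootstrapped value estimate

acc = final_value
returns = np.zeros_like(rewards)
for t in reversed(range(4)):
  acc = rewards[t] + discounts[t] * acc
  returns[t] = acc

# returns[1] == 1.0: the zero discount cuts the sum at the episode boundary.
# returns[3] == 1.0 + 0.9 * 5.0: the last step bootstraps from final_value.
np.testing.assert_allclose(returns, [1.9, 1.0, 5.95, 5.5])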
def _loss(self,
          experience,
          td_errors_loss_fn=element_wise_huber_loss,
          gamma=1.0,
          reward_scale_factor=1.0,
          weights=None):
  """Computes loss for DQN training.

  Args:
    experience: A batch of experience data in the form of a `Trajectory`. The
      structure of `experience` must match that of `self.policy.step_spec`.
      All tensors in `experience` must be shaped `[batch, time, ...]` where
      `time` must be equal to `self.train_sequence_length` if that property
      is not `None`.
    td_errors_loss_fn: A function(td_targets, predictions) to compute the
      element wise loss.
    gamma: Discount for future rewards.
    reward_scale_factor: Multiplicative factor to scale rewards.
    weights: Optional scalar or elementwise (per-batch-entry) importance
      weights. The output td_loss will be scaled by these weights, and the
      final scalar loss is the mean of these values.

  Returns:
    loss: An instance of `DqnLossInfo`.

  Raises:
    ValueError: if the number of actions is greater than 1.
  """
  # Check that `experience` includes two outer dimensions [B, T, ...]. This
  # method requires `experience` to include the time dimension.
  self._check_trajectory_dimensions(experience)

  if self._n_step_update == 1:
    time_steps, actions, next_time_steps = self._experience_to_transitions(
        experience)
  else:
    # To compute n-step returns, we need the first time steps, the first
    # actions, and the last time steps. Therefore we extract the first and
    # last transitions from our Trajectory.
    first_two_steps = tf.nest.map_structure(lambda x: x[:, :2], experience)
    last_two_steps = tf.nest.map_structure(lambda x: x[:, -2:], experience)
    time_steps, actions, _ = self._experience_to_transitions(first_two_steps)
    _, _, next_time_steps = self._experience_to_transitions(last_two_steps)

  with tf.name_scope('loss'):
    actions = tf.nest.flatten(actions)[0]
    q_values, _ = self._q_network(time_steps.observation,
                                  time_steps.step_type)

    # Handle action_spec.shape=(), and shape=(1,) by using the
    # multi_dim_actions param.
    multi_dim_actions = tf.nest.flatten(self._action_spec)[0].shape.ndims > 0
    q_values = common.index_with_actions(
        q_values,
        tf.cast(actions, dtype=tf.int32),
        multi_dim_actions=multi_dim_actions)

    next_q_values = self._compute_next_q_values(next_time_steps)

    if self._n_step_update == 1:
      # Special case for n = 1 to avoid a loss of performance.
      td_targets = compute_td_targets(
          next_q_values,
          rewards=reward_scale_factor * next_time_steps.reward,
          discounts=gamma * next_time_steps.discount)
    else:
      # When computing discounted return, we need to throw out the last time
      # index of both reward and discount, which are filled with dummy values
      # to match the dimensions of the observation.
      # TODO(b/131557265): Replace value_ops.discounted_return with a method
      # that only computes the single value needed.
      n_step_return = value_ops.discounted_return(
          rewards=reward_scale_factor * experience.reward[:, :-1],
          discounts=gamma * experience.discount[:, :-1],
          final_value=next_q_values,
          time_major=False)

      # We only need the first value within the time dimension which
      # corresponds to the full final return. The remaining values are only
      # partial returns.
      td_targets = n_step_return[:, 0]

    valid_mask = tf.cast(~time_steps.is_last(), tf.float32)
    td_error = valid_mask * (td_targets - q_values)

    td_loss = valid_mask * td_errors_loss_fn(td_targets, q_values)

    if nest_utils.is_batched_nested_tensors(
        time_steps, self.time_step_spec, num_outer_dims=2):
      # Do a sum over the time dimension.
      td_loss = tf.reduce_sum(input_tensor=td_loss, axis=1)

    if weights is not None:
      td_loss *= weights

    # Average across the elements of the batch.
    # Note: We use an element wise loss above to ensure each element is
    # always weighted by 1/N where N is the batch size, even when some of
    # the weights are zero due to boundary transitions. Weighting by 1/K,
    # where K is the actual number of non-zero weights, would artificially
    # increase their contribution in the loss. Think about what would happen
    # as the number of boundary samples increases.
    loss = tf.reduce_mean(input_tensor=td_loss)

    with tf.name_scope('Losses/'):
      tf.compat.v1.summary.scalar(
          'loss_' + self.name, loss, collections=['train_' + self.name])

    if self._summarize_grads_and_vars:
      with tf.name_scope('Variables/'):
        for var in self._q_network.trainable_weights:
          tf.compat.v2.summary.histogram(
              name=var.name.replace(':', '_'),
              data=var,
              step=self.train_step_counter)

    if self._debug_summaries:
      diff_q_values = q_values - next_q_values
      common.generate_tensor_summaries('td_error', td_error,
                                       self.train_step_counter)
      common.generate_tensor_summaries('td_loss', td_loss,
                                       self.train_step_counter)
      common.generate_tensor_summaries('q_values', q_values,
                                       self.train_step_counter)
      common.generate_tensor_summaries('next_q_values', next_q_values,
                                       self.train_step_counter)
      common.generate_tensor_summaries('diff_q_values', diff_q_values,
                                       self.train_step_counter)

    return tf_agent.LossInfo(loss,
                             DqnLossInfo(td_loss=td_loss, td_error=td_error))
def _loss(self,
          experience,
          td_errors_loss_fn=common.element_wise_huber_loss,
          gamma=1.0,
          reward_scale_factor=1.0,
          weights=None):
  self._check_trajectory_dimensions(experience)

  if self._n_step_update == 1:
    time_steps, actions, next_time_steps = self._experience_to_transitions(
        experience)
  else:
    first_two_steps = tf.nest.map_structure(lambda x: x[:, :2], experience)
    last_two_steps = tf.nest.map_structure(lambda x: x[:, -2:], experience)
    time_steps, actions, _ = self._experience_to_transitions(first_two_steps)
    _, _, next_time_steps = self._experience_to_transitions(last_two_steps)

  with tf.name_scope("loss"):
    q_values = self._compute_q_values(time_steps, actions)
    next_q_values = self._compute_next_q_values(next_time_steps)

    if self._n_step_update == 1:
      td_targets = compute_td_targets(
          next_q_values,
          rewards=reward_scale_factor * next_time_steps.reward,
          discounts=gamma * next_time_steps.discount)
    else:
      rewards = reward_scale_factor * experience.reward[:, :-1]
      discounts = gamma * experience.discount[:, :-1]
      td_targets = value_ops.discounted_return(
          rewards=rewards,
          discounts=discounts,
          final_value=next_q_values,
          time_major=False,
          provide_all_returns=False)

    valid_mask = tf.cast(~time_steps.is_last(), tf.float32)
    td_error = valid_mask * (td_targets - q_values)
    td_loss = valid_mask * td_errors_loss_fn(td_targets, q_values)

    if nest_utils.is_batched_nested_tensors(
        time_steps, self.time_step_spec, num_outer_dims=2):
      td_loss = tf.reduce_sum(input_tensor=td_loss, axis=1)

    if weights is not None:
      td_loss *= weights

    loss = tf.reduce_mean(input_tensor=td_loss)
    if self._q_network.losses:
      loss = loss + tf.reduce_mean(self._q_network.losses)

    with tf.name_scope("Losses/"):
      tf.compat.v2.summary.scalar(
          name="loss", data=loss, step=self.train_step_counter)

    if self._summarize_grads_and_vars:
      with tf.name_scope("Variables/"):
        for var in self._q_network.trainable_weights:
          tf.compat.v2.summary.histogram(
              name=var.name.replace(":", "_"),
              data=var,
              step=self.train_step_counter)

    if self._debug_summaries:
      diff_q_values = q_values - next_q_values
      common.generate_tensor_summaries("td_error", td_error,
                                       self.train_step_counter)
      common.generate_tensor_summaries("td_loss", td_loss,
                                       self.train_step_counter)
      common.generate_tensor_summaries("q_values", q_values,
                                       self.train_step_counter)
      common.generate_tensor_summaries("next_q_values", next_q_values,
                                       self.train_step_counter)
      common.generate_tensor_summaries("diff_q_values", diff_q_values,
                                       self.train_step_counter)

    return tf_agent.LossInfo(loss,
                             DqnLossInfo(td_loss=td_loss, td_error=td_error))