def call(self, trajectory: Trajectory) -> Trajectory:
  time_step = TimeStep(trajectory.step_type, trajectory.reward,
                       trajectory.discount, trajectory.observation)
  action_dist = self._policy.distribution(time_step).action

  # If the action distribution is in fact a tuple of distributions (one for
  # each resource set), then we need to index into them to obtain the
  # underlying distribution, which can then be used to obtain probabilities.
  # This is only the case when there are multiple resource sets.
  for i in self._action_indices[:-1]:
    action_dist = action_dist[i]
  action_probs = action_dist.probs_parameter()

  # Zero out batch indices where a new episode is starting.
  self._probability_accumulator.assign(
      tf.where(trajectory.is_first(),
               tf.zeros_like(self._probability_accumulator),
               self._probability_accumulator))
  self._count_accumulator.assign(
      tf.where(trajectory.is_first(),
               tf.zeros_like(self._count_accumulator),
               self._count_accumulator))

  # Update accumulators with probability and count increments.
  self._probability_accumulator.assign_add(
      action_probs[..., 0, self._action_indices[-1]])
  self._count_accumulator.assign_add(tf.ones_like(self._count_accumulator))

  # Add final cumulants to the buffer at the end of episodes.
  last_episode_indices = tf.squeeze(tf.where(trajectory.is_last()), axis=-1)
  for idx in last_episode_indices:
    self._buffer.add(self._probability_accumulator[idx] /
                     self._count_accumulator[idx])

  return trajectory
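# The method above follows a zero-on-first / accumulate / emit-on-last pattern
# over a batch of parallel environments. Below is a minimal, self-contained
# sketch of that pattern, assuming eager TensorFlow and a batch of two
# environments; the variable and function names are illustrative, not part of
# any library API.
import tensorflow as tf

prob_acc = tf.Variable(tf.zeros([2]))
count_acc = tf.Variable(tf.zeros([2]))
episode_means = []

def observe(is_first, is_last, step_probs):
  # Reset accumulators in batch slots where a new episode begins.
  prob_acc.assign(tf.where(is_first, tf.zeros_like(prob_acc), prob_acc))
  count_acc.assign(tf.where(is_first, tf.zeros_like(count_acc), count_acc))
  # Accumulate this step's probabilities and step counts.
  prob_acc.assign_add(step_probs)
  count_acc.assign_add(tf.ones_like(count_acc))
  # Emit the per-episode mean probability for any finished episodes.
  for idx in tf.squeeze(tf.where(is_last), axis=-1):
    episode_means.append(prob_acc[idx] / count_acc[idx])

observe(tf.constant([True, True]), tf.constant([False, True]),
        tf.constant([0.4, 0.6]))  # Env 1 finishes with mean probability 0.6.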
def call(self, trajectory: traj.Trajectory):
  if trajectory.step_type.ndim == 0:
    trajectory = nest_utils.batch_nested_array(trajectory)
  completed_episodes = np.sum(trajectory.is_last().astype(np.int64))
  self._np_state.number_episodes += completed_episodes
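# Hedged sketch of the bookkeeping above: `trajectory.is_last()` yields a
# boolean mask over the batch, so summing it counts the episodes that finished
# on this step. The values below are illustrative stand-ins.
import numpy as np

is_last = np.array([False, True, False, True])  # stand-in for trajectory.is_last()
number_episodes = 0
number_episodes += np.sum(is_last.astype(np.int64))
assert number_episodes == 2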
def total_loss(self,
               experience: traj.Trajectory,
               returns: types.Tensor,
               weights: types.Tensor,
               training: bool = False) -> tf_agent.LossInfo:
  # Ensure we see at least one full episode.
  time_steps = ts.TimeStep(experience.step_type,
                           tf.zeros_like(experience.reward),
                           tf.zeros_like(experience.discount),
                           experience.observation)
  is_last = experience.is_last()
  num_episodes = tf.reduce_sum(tf.cast(is_last, tf.float32))
  tf.debugging.assert_greater(
      num_episodes,
      0.0,
      message='No complete episode found. REINFORCE requires full episodes '
      'to compute losses.')

  # Mask out partial episodes at the end of each batch of time_steps.
  # NOTE: We use is_last rather than is_boundary because the last transition
  # is the transition with the last valid reward; boundary transitions do not
  # carry valid rewards. Since REINFORCE computes its loss w.r.t. the returns
  # (and does not bootstrap), keeping the boundary transitions is irrelevant.
  valid_mask = tf.cast(experience.is_last(), dtype=tf.float32)
  valid_mask = tf.math.cumsum(valid_mask, axis=1, reverse=True)
  valid_mask = tf.cast(valid_mask > 0, dtype=tf.float32)
  if weights is not None:
    weights *= valid_mask
  else:
    weights = valid_mask

  advantages = returns
  value_preds = None

  if self._baseline:
    value_preds, _ = self._value_network(time_steps.observation,
                                         time_steps.step_type,
                                         training=True)
    if self._debug_summaries:
      tf.compat.v2.summary.histogram(
          name='value_preds', data=value_preds, step=self.train_step_counter)
    advantages = self._advantage_fn(returns, value_preds)

  if self._debug_summaries:
    tf.compat.v2.summary.histogram(
        name='advantages', data=advantages, step=self.train_step_counter)

  # TODO(b/126592060): replace with tensor normalizer.
  if self._normalize_returns:
    advantages = _standard_normalize(advantages, axes=(0, 1))
    if self._debug_summaries:
      tf.compat.v2.summary.histogram(
          name='normalized_%s' %
          ('advantages' if self._baseline else 'returns'),
          data=advantages,
          step=self.train_step_counter)

  nest_utils.assert_same_structure(time_steps, self.time_step_spec)
  policy_state = _get_initial_policy_state(self.collect_policy, time_steps)
  actions_distribution = self.collect_policy.distribution(
      time_steps, policy_state=policy_state).action

  policy_gradient_loss = self.policy_gradient_loss(
      actions_distribution,
      experience.action,
      experience.is_boundary(),
      advantages,
      num_episodes,
      weights,
  )
  entropy_regularization_loss = self.entropy_regularization_loss(
      actions_distribution, weights)
  network_regularization_loss = tf.nn.scale_regularization_loss(
      self._actor_network.losses)

  total_loss = (policy_gradient_loss + network_regularization_loss +
                entropy_regularization_loss)

  losses_dict = {
      'policy_gradient_loss': policy_gradient_loss,
      'policy_network_regularization_loss': network_regularization_loss,
      'entropy_regularization_loss': entropy_regularization_loss,
      'value_estimation_loss': 0.0,
      'value_network_regularization_loss': 0.0,
  }

  value_estimation_loss = None
  if self._baseline:
    value_estimation_loss = self.value_estimation_loss(
        value_preds, returns, num_episodes, weights)
    value_network_regularization_loss = tf.nn.scale_regularization_loss(
        self._value_network.losses)
    total_loss += value_estimation_loss + value_network_regularization_loss
    losses_dict['value_estimation_loss'] = value_estimation_loss
    losses_dict['value_network_regularization_loss'] = (
        value_network_regularization_loss)

  loss_info_extra = ReinforceAgentLossInfo(**losses_dict)

  # Total loss is summarized below but is not part of loss_info_extra.
  losses_dict['total_loss'] = total_loss

  common.summarize_scalar_dict(
      losses_dict, self.train_step_counter, name_scope='Losses/')

  return tf_agent.LossInfo(total_loss, loss_info_extra)
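# Minimal sketch of the partial-episode mask computed in total_loss, assuming
# a [batch, time] layout; the values are illustrative. A reversed cumulative
# sum over is_last() is positive up to and including the final complete
# episode and zero on the trailing partial steps, which therefore receive
# zero weight in the loss.
import tensorflow as tf

is_last = tf.constant([[0., 0., 1., 0., 0.]])  # Episode ends at t=2.
valid_mask = tf.math.cumsum(is_last, axis=1, reverse=True)
valid_mask = tf.cast(valid_mask > 0, tf.float32)
# valid_mask -> [[1., 1., 1., 0., 0.]]: the two trailing partial-episode
# steps are masked out.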