Example #1
    def _train(self, experience, weights=None):
        # TODO(b/126593927): Support batch dimensions >1.
        if experience.step_type.shape[0] != 1:
            raise NotImplementedError(
                'ReinforceAgent does not yet support batch '
                'dimensions greater than 1.')

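        # Squeeze out the size-1 outer batch dimension so the remaining ops see [T, ...] tensors.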
        experience = tf.nest.map_structure(lambda t: tf.squeeze(t, 0),
                                           experience)
        returns = common.compute_returns(experience.reward,
                                         experience.discount)
        if self._debug_summaries:
            tf.compat.v2.summary.histogram(name='rewards',
                                           data=experience.reward,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='discounts',
                                           data=experience.discount,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='returns',
                                           data=returns,
                                           step=self.train_step_counter)

        # TODO(b/126592060): replace with tensor normalizer.
        if self._normalize_returns:
            ret_mean, ret_var = tf.nn.moments(x=returns, axes=[0])
            returns = (returns - ret_mean) / (tf.sqrt(ret_var) + 1e-6)
            if self._debug_summaries:
                tf.compat.v2.summary.histogram(name='normalized_returns',
                                               data=returns,
                                               step=self.train_step_counter)

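        # Rebuild the time steps with zeroed reward and discount; returns were already computed above, so the loss only needs step_type and observation.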
        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)

        variables_to_train = self._actor_network.variables
        with tf.GradientTape() as tape:
            loss_info = self._loss(time_step,
                                   experience.action,
                                   tf.stop_gradient(returns),
                                   weights=weights)
            tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
        grads = tape.gradient(loss_info.loss, variables_to_train)

        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars,
                                        global_step=self.train_step_counter)

        return tf.nest.map_structure(tf.identity, loss_info)
Example #2
    def _apply_loss(self, aggregated_losses, variables_to_train, tape,
                    optimizer):
        total_loss = aggregated_losses.total_loss
        tf.debugging.check_numerics(total_loss, "Loss is inf or nan")
        assert list(variables_to_train), "No variables in the agent's network."

        grads = tape.gradient(total_loss, variables_to_train)
        grads_and_vars = list(zip(grads, variables_to_train))

        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self.summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)

        optimizer.apply_gradients(grads_and_vars)

        if self.summaries_enabled:
            dict_losses = {
                "loss": aggregated_losses.weighted,
                "reg_loss": aggregated_losses.regularization,
                "total_loss": total_loss
            }
            common.summarize_scalar_dict(dict_losses,
                                         step=self.train_step_counter,
                                         name_scope="Losses/")
Example #3
    def _train(self, experience: types.NestedTensor,
               weights: types.Tensor) -> tf_agent.LossInfo:
        experience = self._as_trajectory(experience)

        with tf.GradientTape() as tape:
            loss_info = self._loss(experience, weights=weights, training=True)

        variables_to_train = self._variables_to_train()
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars)
        self.train_step_counter.assign_add(1)

        return loss_info
Example #4
  def _train(self, experience, weights):
    with tf.GradientTape() as tape:
      loss_info = self._loss(
          experience,
          td_errors_loss_fn=self._td_errors_loss_fn,
          gamma=self._gamma,
          reward_scale_factor=self._reward_scale_factor,
          weights=weights,
          training=True)
    tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
    variables_to_train = self._q_network.trainable_weights
    non_trainable_weights = self._q_network.non_trainable_weights
    assert list(variables_to_train), "No variables in the agent's q_network."
    grads = tape.gradient(loss_info.loss, variables_to_train)
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = list(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
      grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                       self._gradient_clipping)

    if self._summarize_grads_and_vars:
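      # Also list the non-trainable weights (paired with a None gradient) so they show up in the variable summaries.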
      grads_and_vars_with_non_trainable = (
          grads_and_vars + [(None, v) for v in non_trainable_weights])
      eager_utils.add_variables_summaries(grads_and_vars_with_non_trainable,
                                          self.train_step_counter)
      eager_utils.add_gradients_summaries(grads_and_vars,
                                          self.train_step_counter)
    self._optimizer.apply_gradients(grads_and_vars)
    self.train_step_counter.assign_add(1)

    self._update_target()

    return loss_info
Example #5
  def _train_v1(self, experience, weights):
    with tf.GradientTape() as tape:
      loss_info = self._loss(
          experience,
          td_errors_loss_fn=self._td_errors_loss_fn,
          gamma=self._gamma,
          reward_scale_factor=self._reward_scale_factor,
          weights=weights)
    tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
    variables_to_train = self._q_network.trainable_weights
    assert list(variables_to_train), "No variables in the agent's q_network."
    grads = tape.gradient(loss_info.loss, variables_to_train)
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
      grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                       self._gradient_clipping)

    if self._summarize_grads_and_vars:
      eager_utils.add_variables_summaries(grads_and_vars,
                                          self.train_step_counter)
      eager_utils.add_gradients_summaries(grads_and_vars,
                                          self.train_step_counter)

    train_op = self._optimizer.apply_gradients(
        grads_and_vars, global_step=self.train_step_counter)

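    # Group the target-network update with the train op so both run together in graph (TF1-style) execution.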
    update_op = self._update_target()
    train_op = tf.group(train_op, update_op)

    return train_op, loss_info
Example #6
    def _train(self, experience, weights):
        (observations, actions,
         rewards) = bandit_utils.process_experience_for_neural_agents(
             experience, self._observation_and_action_constraint_splitter,
             self._accepts_per_arm_features, self.training_data_spec)

        with tf.GradientTape() as tape:
            loss_info = self.loss(observations,
                                  actions,
                                  rewards,
                                  weights=weights,
                                  training=True)

        variables_to_train = self._variables_to_train()
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars)
        self.train_step_counter.assign_add(1)

        return loss_info
Example #7
    def _create_summaries(grads_and_vars):
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
        grads_and_vars = clip_gradients(grads_and_vars)
        return grads_and_vars
Example #8
  def _train(self, experience, weights):
    time_steps, actions, next_time_steps = self._experience_to_transitions(
        experience)

    with tf.GradientTape() as tape:
      loss_info = self.loss(time_steps,
                            actions,
                            next_time_steps,
                            td_errors_loss_fn=self._td_errors_loss_fn,
                            gamma=self._gamma,
                            reward_scale_factor=self._reward_scale_factor,
                            weights=weights)
    tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
    variables_to_train = self._q_network.trainable_weights
    assert list(variables_to_train), "No variables in the agent's q_network."
    grads = tape.gradient(loss_info.loss, variables_to_train)
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
      grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                       self._gradient_clipping)

    if self._summarize_grads_and_vars:
      eager_utils.add_variables_summaries(grads_and_vars,
                                          self.train_step_counter)
      eager_utils.add_gradients_summaries(grads_and_vars,
                                          self.train_step_counter)

    self._optimizer.apply_gradients(grads_and_vars,
                                    global_step=self.train_step_counter)

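    # Sync the target Q-network from the online Q-network.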
    self._update_target()

    return loss_info
Example #9
    def compute_loss_using_reward_layer(
            self,
            observation: types.NestedTensor,
            action: types.Tensor,
            reward: types.Tensor,
            weights: Optional[types.Float] = None,
            training: bool = False) -> tf_agent.LossInfo:
        """Computes loss using the reward layer.

    Args:
      observation: A batch of observations.
      action: A batch of actions.
      reward: A batch of rewards.
      weights: Optional scalar or elementwise (per-batch-entry) importance
        weights.  The output batch loss will be scaled by these weights, and
        the final scalar loss is the mean of these values.
      training: Whether the loss is being used for training.

    Returns:
      loss: A `LossInfo` containing the loss for the training step.
    """
        # Update the neural network params.
        with tf.GradientTape() as tape:
            loss_info = self._loss_using_reward_layer(observation,
                                                      action,
                                                      reward,
                                                      weights,
                                                      training=training)
        tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
        tf.compat.v2.summary.scalar(name='using_reward_layer',
                                    data=1,
                                    step=self.train_step_counter)
        if self._summarize_grads_and_vars:
            self.compute_summaries(loss_info.loss)
        variables_to_train = (self._encoding_network.trainable_weights +
                              self._reward_layer.trainable_weights)
        if not variables_to_train:
            raise ValueError('No variable to train in the agent.')

        grads = tape.gradient(loss_info.loss, variables_to_train)
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            with tf.name_scope('Reward_network/'):
                eager_utils.add_variables_summaries(grads_and_vars,
                                                    self.train_step_counter)
                eager_utils.add_gradients_summaries(grads_and_vars,
                                                    self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars)
        self.train_step_counter.assign_add(1)

        return loss_info
Example #10
    def _train(self, experience, weights=None):
        # Add a mask to ensure we reset the return calculation at episode
        # boundaries. This is needed in cases where episodes are truncated before
        # reaching a terminal state.
        non_last_mask = tf.cast(
            tf.math.not_equal(experience.next_step_type, ts.StepType.LAST),
            tf.float32)
        discounts = non_last_mask * experience.discount * self._gamma
        returns = value_ops.discounted_return(experience.reward,
                                              discounts,
                                              time_major=False)

        if self._debug_summaries:
            tf.compat.v2.summary.histogram(name='rewards',
                                           data=experience.reward,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='discounts',
                                           data=experience.discount,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='returns',
                                           data=returns,
                                           step=self.train_step_counter)

        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)

        with tf.GradientTape() as tape:
            loss_info = self.total_loss(time_step,
                                        experience.action,
                                        tf.stop_gradient(returns),
                                        weights=weights)
            tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
        variables_to_train = self._actor_network.trainable_weights
        if self._baseline:
            variables_to_train += self._value_network.trainable_weights
        grads = tape.gradient(loss_info.loss, variables_to_train)

        grads_and_vars = list(zip(grads, variables_to_train))
        if self._gradient_clipping:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars,
                                        global_step=self.train_step_counter)

        return tf.nest.map_structure(tf.identity, loss_info)
Example #11
    def _train(self, experience, weights):
        rewards, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.reward, self._time_step_spec.reward)
        actions, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.action, self._action_spec)
        observations, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.observation, self.training_data_spec.observation)
        if self._observation_and_action_constraint_splitter is not None:
            observations, _ = self._observation_and_action_constraint_splitter(
                observations)
        if self._accepts_per_arm_features:
            # The arm observation we train on needs to be copied from the respective
            # policy info field to the per arm observation field. Pretending there was
            # only one action, we fill the action field with zeros.
            chosen_action, _ = nest_utils.flatten_multi_batched_nested_tensors(
                experience.policy_info.chosen_arm_features,
                self.policy.info_spec.chosen_arm_features)
            observations[
                bandit_spec_utils.PER_ARM_FEATURE_KEY] = tf.expand_dims(
                    chosen_action, axis=1)
            actions = tf.zeros_like(actions)

        with tf.GradientTape() as tape:
            loss_info = self.loss(observations,
                                  actions,
                                  rewards,
                                  weights=weights,
                                  training=True)

        self.compute_summaries(loss_info.loss)
        variables_to_train = self._reward_network.trainable_weights
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        training_lib.apply_gradients(self._optimizer,
                                     grads_and_vars,
                                     global_step=self.train_step_counter)

        return loss_info
Example #12
    def _train(self, experience, weights=None):
        returns = value_ops.discounted_return(experience.reward,
                                              experience.discount,
                                              time_major=False)

        if self._debug_summaries:
            tf.compat.v2.summary.histogram(name='rewards',
                                           data=experience.reward,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='discounts',
                                           data=experience.discount,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='returns',
                                           data=returns,
                                           step=self.train_step_counter)

        # TODO(b/126592060): replace with tensor normalizer.
        if self._normalize_returns:
            returns = _standard_normalize(returns, axes=(0, 1))
            if self._debug_summaries:
                tf.compat.v2.summary.histogram(name='normalized_returns',
                                               data=returns,
                                               step=self.train_step_counter)

        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)

        variables_to_train = self._actor_network.variables
        with tf.GradientTape() as tape:
            loss_info = self._loss(time_step,
                                   experience.action,
                                   tf.stop_gradient(returns),
                                   weights=weights)
            tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
        grads = tape.gradient(loss_info.loss, variables_to_train)

        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars,
                                        global_step=self.train_step_counter)

        return tf.nest.map_structure(tf.identity, loss_info)
Example #13
    def _apply_gradients(self, gradients, variables, optimizer):
        grads_and_vars = list(zip(gradients, variables))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        optimizer.apply_gradients(grads_and_vars)
Example #14
  def _apply_gradients(self, gradients, variables, optimizer):
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(gradients, variables))
    if self._gradient_clipping is not None:
      grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                       self._gradient_clipping)

    if self._summarize_grads_and_vars:
      eager_utils.add_variables_summaries(grads_and_vars,
                                          self.train_step_counter)
      eager_utils.add_gradients_summaries(grads_and_vars,
                                          self.train_step_counter)

    return optimizer.apply_gradients(grads_and_vars)
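
Both helpers above follow the same recipe: pair gradients with their variables, optionally clip each gradient's norm, optionally write summaries, then apply. Below is a minimal standalone sketch of that recipe in plain TensorFlow 2; the function name and the clip_norm argument are illustrative and not part of the tf_agents API.

import tensorflow as tf

def apply_gradients_sketch(tape, loss, variables, optimizer, clip_norm=None):
    # Differentiate the scalar loss with respect to the given variables.
    grads = tape.gradient(loss, variables)
    grads_and_vars = list(zip(grads, variables))
    if clip_norm is not None:
        # Clip each gradient by its own norm, mirroring eager_utils.clip_gradient_norms.
        grads_and_vars = [
            (tf.clip_by_norm(g, clip_norm) if g is not None else g, v)
            for g, v in grads_and_vars
        ]
    optimizer.apply_gradients(grads_and_vars)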
Example #15
    def _train(self, experience, weights=None):
        # TODO(b/132914246): Use .is_last() to mask the end of each episode.
        returns = value_ops.discounted_return(experience.reward,
                                              experience.discount *
                                              self._gamma,
                                              time_major=False)

        if self._debug_summaries:
            tf.compat.v2.summary.histogram(name='rewards',
                                           data=experience.reward,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='discounts',
                                           data=experience.discount,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='returns',
                                           data=returns,
                                           step=self.train_step_counter)

        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)

        with tf.GradientTape() as tape:
            loss_info = self.total_loss(time_step,
                                        experience.action,
                                        tf.stop_gradient(returns),
                                        weights=weights)
            tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
        variables_to_train = self._actor_network.trainable_weights
        if self._baseline:
            variables_to_train += self._value_network.trainable_weights
        grads = tape.gradient(loss_info.loss, variables_to_train)

        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars,
                                        global_step=self.train_step_counter)

        return tf.nest.map_structure(tf.identity, loss_info)
Example #16
    def _train(self, experience: types.NestedTensor,
               weights: types.Tensor) -> tf_agent.LossInfo:
        (observations, actions,
         objective_values) = bandit_utils.process_experience_for_neural_agents(
             experience, self._accepts_per_arm_features,
             self.training_data_spec)
        if self._observation_and_action_constraint_splitter is not None:
            observations, _ = self._observation_and_action_constraint_splitter(
                observations)
        if objective_values.shape.rank != 2:
            raise ValueError(
                'The objectives tensor should be rank-2 [batch_size, num_objectives],'
                ' but found to be rank-{}'.format(objective_values.shape.rank))
        if objective_values.shape[1] != self._num_objectives:
            raise ValueError(
                'The number of objectives in the objective_values tensor: {} '
                'is different from the number of objective networks: {}.'.
                format(objective_values.shape[1], self._num_objectives))

        with tf.GradientTape() as tape:
            loss_info = self.loss(observations,
                                  actions,
                                  objective_values,
                                  weights=weights,
                                  training=True)

        variables_to_train = self._variables_to_train()
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars)
        self.train_step_counter.assign_add(1)

        return loss_info
Example #17
    def _create_summaries(grads_and_vars):
        grads_and_vars = eager_utils.add_gradients_summaries(
            grads_and_vars)
        grads_and_vars = eager_utils.add_variables_summaries(
            grads_and_vars)
        grads_and_vars = clip_gradients(grads_and_vars)
        return grads_and_vars
Example #18
    def _train(self, experience, weights):
        rewards, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.reward, self._time_step_spec.reward)
        actions, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.action, self._action_spec)
        observations, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.observation, self._time_step_spec.observation)
        if self._observation_and_action_constraint_splitter is not None:
            observations, _ = self._observation_and_action_constraint_splitter(
                observations)

        with tf.GradientTape() as tape:
            loss_info = self.loss(observations,
                                  actions,
                                  rewards,
                                  weights=weights,
                                  training=True)
        tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
        self.compute_summaries(loss_info.loss)
        variables_to_train = self._reward_network.trainable_weights
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        training_lib.apply_gradients(self._optimizer,
                                     grads_and_vars,
                                     global_step=self.train_step_counter)

        return loss_info
Example #19
    def _train(self, experience, weights):
        experience = self._as_trajectory(experience)

        with tf.GradientTape() as tape:
            loss_info = self._loss(experience, weights=weights, training=True)

        variables_to_train = self._variables_to_train()
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars)
        self.train_step_counter.assign_add(1)
        if not self._accepts_per_arm_features and self._num_samples_list:
            # Compute the number of samples for each action in the current batch.
            actions_flattened = tf.reshape(experience.action, [-1])
            num_samples_per_action_current = [
                tf.reduce_sum(tf.cast(tf.equal(actions_flattened, k),
                                      tf.int64))
                for k in range(self._num_actions)
            ]
            # Update the number of samples for each action.
            for a, b in zip(self._num_samples_list,
                            num_samples_per_action_current):
                tf.compat.v1.assign_add(a, b)

        return loss_info
Example #20
    def _train(self, experience, weights):
        # Get individual tensors from transitions.
        (time_steps, policy_steps_,
         next_time_steps) = trajectory.to_transition(experience)
        actions = policy_steps_.action

        if self._debug_summaries:
            actions_list = tf.nest.flatten(actions)
            show_action_index = len(actions_list) != 1
            for i, single_action in enumerate(actions_list):
                action_name = ('actions_{}'.format(i)
                               if show_action_index else 'actions')
                tf.compat.v2.summary.histogram(name=action_name,
                                               data=single_action,
                                               step=self.train_step_counter)

        action_distribution_parameters = policy_steps_.info

        # Reconstruct per-timestep policy distribution from stored distribution
        #   parameters.
        old_actions_distribution = (
            distribution_spec.nested_distributions_from_specs(
                self._action_distribution_spec,
                action_distribution_parameters))

        # Compute log probability of actions taken during data collection, using the
        #   collect policy distribution.
        act_log_probs = common.log_probability(old_actions_distribution,
                                               actions, self._action_spec)

        # Compute the value predictions for states using the current value function.
        # To be used for return & advantage computation.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(
            batch_size=batch_size)

        value_preds, unused_policy_state = self._collect_policy.apply_value_network(
            experience.observation,
            experience.step_type,
            policy_state=policy_state)
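        # Treat the value predictions as constants so the return/advantage targets do not backpropagate into the value network here.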
        value_preds = tf.stop_gradient(value_preds)

        valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

        if weights is None:
            weights = valid_mask
        else:
            weights *= valid_mask

        returns, normalized_advantages = self.compute_return_and_advantage(
            next_time_steps, value_preds)

        # Loss tensors across batches will be aggregated for summaries.
        policy_gradient_losses = []
        value_estimation_losses = []
        l2_regularization_losses = []
        entropy_regularization_losses = []
        kl_penalty_losses = []

        loss_info = None  # TODO(b/123627451): Remove.
        # For each epoch, create its own train op that depends on the previous one.
        for i_epoch in range(self._num_epochs):
            with tf.name_scope('epoch_%d' % i_epoch):
                # Only save debug summaries for first and last epochs.
                debug_summaries = (self._debug_summaries
                                   and (i_epoch == 0
                                        or i_epoch == self._num_epochs - 1))

                # Build one epoch train op.
                with tf.GradientTape() as tape:
                    loss_info = self.get_epoch_loss(
                        time_steps, actions, act_log_probs, returns,
                        normalized_advantages, action_distribution_parameters,
                        weights, self.train_step_counter, debug_summaries)

                variables_to_train = (self._actor_net.trainable_weights +
                                      self._value_net.trainable_weights)
                grads = tape.gradient(loss_info.loss, variables_to_train)
                # Tuple is used for py3, where zip is a generator producing values once.
                grads_and_vars = tuple(zip(grads, variables_to_train))
                if self._gradient_clipping > 0:
                    grads_and_vars = eager_utils.clip_gradient_norms(
                        grads_and_vars, self._gradient_clipping)

                # If summarize_gradients, create functions for summarizing both
                # gradients and variables.
                if self._summarize_grads_and_vars and debug_summaries:
                    eager_utils.add_gradients_summaries(
                        grads_and_vars, self.train_step_counter)
                    eager_utils.add_variables_summaries(
                        grads_and_vars, self.train_step_counter)

                self._optimizer.apply_gradients(
                    grads_and_vars, global_step=self.train_step_counter)

                policy_gradient_losses.append(
                    loss_info.extra.policy_gradient_loss)
                value_estimation_losses.append(
                    loss_info.extra.value_estimation_loss)
                l2_regularization_losses.append(
                    loss_info.extra.l2_regularization_loss)
                entropy_regularization_losses.append(
                    loss_info.extra.entropy_regularization_loss)
                kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

        # After update epochs, update adaptive kl beta, then update observation
        #   normalizer and reward normalizer.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(batch_size)
        # Compute the mean kl from previous action distribution.
        kl_divergence = self._kl_divergence(
            time_steps, action_distribution_parameters,
            self._collect_policy.distribution(time_steps, policy_state).action)
        self.update_adaptive_kl_beta(kl_divergence)

        if self._observation_normalizer:
            self._observation_normalizer.update(time_steps.observation,
                                                outer_dims=[0, 1])
        else:
            # TODO(b/127661780): Verify performance of reward_normalizer when obs are
            #                    not normalized
            if self._reward_normalizer:
                self._reward_normalizer.update(next_time_steps.reward,
                                               outer_dims=[0, 1])

        loss_info = tf.nest.map_structure(tf.identity, loss_info)

        # Make summaries for total loss across all epochs.
        # The *_losses lists will have been populated by
        #   calls to self.get_epoch_loss.
        with tf.name_scope('Losses/'):
            total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
            total_value_estimation_loss = tf.add_n(value_estimation_losses)
            total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
            total_entropy_regularization_loss = tf.add_n(
                entropy_regularization_losses)
            total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
            tf.compat.v2.summary.scalar(name='policy_gradient_loss',
                                        data=total_policy_gradient_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='value_estimation_loss',
                                        data=total_value_estimation_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='l2_regularization_loss',
                                        data=total_l2_regularization_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='entropy_regularization_loss',
                                        data=total_entropy_regularization_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='kl_penalty_loss',
                                        data=total_kl_penalty_loss,
                                        step=self.train_step_counter)

            total_abs_loss = (tf.abs(total_policy_gradient_loss) +
                              tf.abs(total_value_estimation_loss) +
                              tf.abs(total_entropy_regularization_loss) +
                              tf.abs(total_l2_regularization_loss) +
                              tf.abs(total_kl_penalty_loss))

            tf.compat.v2.summary.scalar(name='total_abs_loss',
                                        data=total_abs_loss,
                                        step=self.train_step_counter)

        if self._summarize_grads_and_vars:
            with tf.name_scope('Variables/'):
                all_vars = (self._actor_net.trainable_weights +
                            self._value_net.trainable_weights)
                for var in all_vars:
                    tf.compat.v2.summary.histogram(
                        name=var.name.replace(':', '_'),
                        data=var,
                        step=self.train_step_counter)

        return loss_info
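
All of the _train variants above are reached through the public agent.train(...) entry point of tf_agents, which forwards the sampled experience to _train. Below is a minimal sketch of such a training loop, assuming an already constructed tf_agents agent and a populated replay_buffer; the batch size, num_steps, and iteration count are illustrative.

import tensorflow as tf

# Assumes `agent` is a tf_agents TFAgent and `replay_buffer` a TFUniformReplayBuffer
# that a driver/collect policy has already filled.
dataset = replay_buffer.as_dataset(sample_batch_size=64, num_steps=2).prefetch(3)
iterator = iter(dataset)

num_iterations = 1000  # illustrative
for _ in range(num_iterations):
    experience, _ = next(iterator)
    loss_info = agent.train(experience)  # dispatches to the agent's _train
    tf.print('train step:', agent.train_step_counter, 'loss:', loss_info.loss)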