Example #1
    def _train(self, experience, weights=None, train_step_counter=None):
        # TODO(sfishman): Support batch dimensions >1.
        if experience.step_type.shape[0] != 1:
            raise NotImplementedError(
                'ReinforceAgent does not yet support batch '
                'dimensions greater than 1.')
        experience = tf.nest.map_structure(lambda t: tf.squeeze(t, 0),
                                           experience)
        returns = common.compute_returns(experience.reward,
                                         experience.discount)
        if self._debug_summaries:
            tf.contrib.summary.histogram('rewards', experience.reward)
            tf.contrib.summary.histogram('discounts', experience.discount)
            tf.contrib.summary.histogram('returns', returns)

        # TODO(kbanoop): replace with tensor normalizer.
        if self._normalize_returns:
            ret_mean, ret_var = tf.nn.moments(x=returns, axes=[0])
            returns = (returns - ret_mean) / (tf.sqrt(ret_var) + 1e-6)
            if self._debug_summaries:
                tf.contrib.summary.histogram('normalized_returns', returns)

        # TODO(kbanoop): remove after changing network interface to accept
        # observations and step_types, instead of time_steps.
        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)
        # TODO(kbanoop): Filter boundary steps.

        loss_info = self._loss(time_step,
                               experience.action,
                               tf.stop_gradient(returns),
                               weights=weights)

        clip_gradients = None
        if self._gradient_clipping:
            clip_gradients = eager_utils.clip_gradient_norms_fn(
                self._gradient_clipping)

        loss_info = eager_utils.create_train_step(
            loss_info,
            self._optimizer,
            total_loss_fn=lambda loss_info: loss_info.loss,
            global_step=train_step_counter,
            transform_grads_fn=clip_gradients,
            summarize_gradients=self._summarize_grads_and_vars,
            variables_to_train=lambda: self._actor_network.trainable_weights,
        )

        if self._summarize_grads_and_vars:
            with tf.name_scope('Variables/'):
                for var in self._actor_network.trainable_weights:
                    tf.contrib.summary.histogram(var.name.replace(':', '_'),
                                                 var)

        return loss_info
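Across these examples, clip_gradient_norms_fn(max_norm) is used as a transform_grads_fn: given a list of (gradient, variable) pairs, it returns the same pairs with each gradient's norm capped. Judging from the unit test in Example #2 below (a lone gradient of 4.0 is clipped to 3.0), the clipping appears to be applied per gradient rather than to the global norm. A rough plain-TensorFlow sketch of such a transform, not the library's actual implementation, could look like this; make_clip_grads_fn is a hypothetical name:

import tensorflow as tf


def make_clip_grads_fn(max_norm):
    """Per-gradient norm clipping, sketched with plain tf.clip_by_norm."""

    def transform_grads_fn(grads_and_vars):
        # Cap each gradient's norm at max_norm; leave None gradients untouched.
        return [(tf.clip_by_norm(g, max_norm) if g is not None else None, v)
                for g, v in grads_and_vars]

    return transform_grads_fn

Applied to the test in Example #2, this maps a gradient of 4.0 to 3.0 while leaving gradients whose norm is already below the threshold unchanged.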
Example #2
  def testClipGradsFn(self):
    xs = tf.Variable(0.0)
    grads = tf.constant(4.0)
    gradients_to_variables = [(grads, xs)]
    clipped_gradients_to_variables = eager_utils.clip_gradient_norms_fn(3.0)(
        gradients_to_variables)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAlmostEqual(4.0, self.evaluate(gradients_to_variables[0][0]))
    self.assertAlmostEqual(3.0,
                           self.evaluate(clipped_gradients_to_variables[0][0]))
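The agents above always route the clipped gradients through eager_utils.create_train_step, but the function returned by clip_gradient_norms_fn can also be applied directly before a Keras optimizer update. A minimal sketch, assuming tf_agents and TF2 eager execution; the toy model, data, and learning rate are invented for illustration:

import tensorflow as tf
from tf_agents.utils import eager_utils

# Toy model, data, and learning rate, invented purely for illustration.
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
clip_fn = eager_utils.clip_gradient_norms_fn(3.0)

x = tf.ones([8, 4])
y = tf.zeros([8, 1])
with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x) - y))
grads = tape.gradient(loss, model.trainable_variables)
# Clip each gradient's norm before the update, as the agents above do.
optimizer.apply_gradients(clip_fn(list(zip(grads, model.trainable_variables))))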
Example #3
        def clip_and_summarize_gradients(grads_and_vars):
            """Clips gradients, and summarizes gradients and variables."""
            if self._gradient_clipping is not None:
                grads_and_vars = eager_utils.clip_gradient_norms_fn(
                    self._gradient_clipping)(grads_and_vars)

            if self._summarize_grads_and_vars:
                # TODO(kbanoop): Move gradient summaries to train_op after we switch to
                # eager train op, and move variable summaries to critic_loss.
                for grad, var in grads_and_vars:
                    with tf.name_scope('Gradients/'):
                        if grad is not None:
                            tf.contrib.summary.histogram(grad.op.name, grad)
                    with tf.name_scope('Variables/'):
                        if var is not None:
                            tf.contrib.summary.histogram(var.op.name, var)
            return grads_and_vars
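Example #3 still relies on tf.contrib.summary, which no longer exists in TensorFlow 2.x. A TF2-flavored sketch of the same transform is given below, assuming an active default tf.summary writer; the function name is made up, and the explicit max_norm and step parameters stand in for the agent's self attributes:

import tensorflow as tf
from tf_agents.utils import eager_utils


def clip_and_summarize_gradients_v2(grads_and_vars, max_norm, step):
    """TF2-style sketch of Example #3; assumes an active tf.summary writer."""
    if max_norm is not None:
        grads_and_vars = eager_utils.clip_gradient_norms_fn(max_norm)(
            grads_and_vars)
    for grad, var in grads_and_vars:
        # var.name is used for both histograms; eager tensors have no op.name.
        name = var.name.replace(':', '_')
        if grad is not None:
            tf.summary.histogram('Gradients/' + name, grad, step=step)
        tf.summary.histogram('Variables/' + name, var, step=step)
    return grads_and_vars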
Example #4
    def _train(self, experience, weights=None):
        loss_info = self._loss(experience, weights=weights)

        transform_grads_fn = None
        if self._gradient_clipping is not None:
            transform_grads_fn = eager_utils.clip_gradient_norms_fn(
                self._gradient_clipping)

        loss_info = eager_utils.create_train_step(
            loss_info,
            self._optimizer,
            total_loss_fn=lambda loss_info: loss_info.loss,
            global_step=self.train_step_counter,
            transform_grads_fn=transform_grads_fn,
            summarize_gradients=self._summarize_grads_and_vars,
            variables_to_train=lambda: self._cloning_network.trainable_weights,
        )

        return loss_info
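Examples #1, #4, and #5 all follow the same recipe: compute a LossInfo, optionally build a clipping transform, and hand both to eager_utils.create_train_step. The shared skeleton can be factored out as below; build_train_step and its parameters are hypothetical names, not part of tf_agents:

from tf_agents.utils import eager_utils


def build_train_step(loss_info, optimizer, variables_fn, train_step_counter,
                     gradient_clipping=None, summarize_grads_and_vars=False):
    """Sketch of the recurring pattern shared by Examples #1, #4, and #5."""
    transform_grads_fn = None
    if gradient_clipping is not None:
        # Per-gradient norm clipping applied just before the optimizer update.
        transform_grads_fn = eager_utils.clip_gradient_norms_fn(gradient_clipping)
    return eager_utils.create_train_step(
        loss_info,
        optimizer,
        total_loss_fn=lambda info: info.loss,
        global_step=train_step_counter,
        transform_grads_fn=transform_grads_fn,
        summarize_gradients=summarize_grads_and_vars,
        variables_to_train=variables_fn,
    )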
Example #5
  def _train(self, experience, train_step_counter=None, weights=None):
    time_steps, actions, next_time_steps = self._experience_to_transitions(
        experience)

    loss_info = self._loss(
        time_steps,
        actions,
        next_time_steps,
        td_errors_loss_fn=self._td_errors_loss_fn,
        gamma=self._gamma,
        reward_scale_factor=self._reward_scale_factor,
        weights=weights)

    transform_grads_fn = None
    if self._gradient_clipping is not None:
      transform_grads_fn = eager_utils.clip_gradient_norms_fn(
          self._gradient_clipping)

    loss_info = eager_utils.create_train_step(
        loss_info,
        self._optimizer,
        total_loss_fn=lambda loss_info: loss_info.loss,
        global_step=train_step_counter,
        transform_grads_fn=transform_grads_fn,
        summarize_gradients=self._summarize_grads_and_vars,
        variables_to_train=lambda: self._q_network.trainable_weights,
    )

    # Make sure the periodic update_targets op is only created once.
    if self._target_update_train_op is None:
      with tf.control_dependencies([loss_info.loss]):
        self._target_update_train_op = self._update_targets(
            self._target_update_tau, self._target_update_period)

    with tf.control_dependencies([self._target_update_train_op]):
      loss_info = tf.nest.map_structure(
          lambda t: tf.identity(t, name='loss_info'), loss_info)

    return loss_info
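The last block of Example #5 chains the target-network update behind the train step and then re-wraps the LossInfo tensors, so that fetching any of them also triggers the update. That idiom can be isolated as below; ToyLossInfo and sequence_after are made-up names, and the trick only has an effect in graph mode (tf.compat.v1), where control dependencies are honored:

import collections

import tensorflow as tf

# Toy stand-in for the agent's LossInfo namedtuple, just to show the idiom.
ToyLossInfo = collections.namedtuple('ToyLossInfo', ['loss', 'extra'])


def sequence_after(update_op, loss_info):
    """Returns a copy of loss_info whose tensors only run after update_op."""
    with tf.control_dependencies([update_op]):
        return tf.nest.map_structure(
            lambda t: tf.identity(t, name='loss_info'), loss_info)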
Example #6
    def build_train_op(self, time_steps, actions, act_log_probs, returns,
                       normalized_advantages, action_distribution_parameters,
                       weights, train_step, summarize_gradients,
                       gradient_clipping, debug_summaries):
        """Compute the loss and create optimization op for one training epoch.

    All tensors should have a single batch dimension.

    Args:
      time_steps: A minibatch of TimeStep tuples.
      actions: A minibatch of actions.
      act_log_probs: A minibatch of action probabilities (probability under the
        sampling policy).
      returns: A minibatch of per-timestep returns.
      normalized_advantages: A minibatch of normalized per-timestep advantages.
      action_distribution_parameters: Parameters of data-collecting action
        distribution. Needed for KL computation.
      weights: Optional scalar or element-wise (per-batch-entry) importance
        weights.  Includes a mask for invalid timesteps.
      train_step: A train_step variable to increment for each train step.
        Typically the global_step.
      summarize_gradients: If true, gradient summaries will be written.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: True if debug summaries should be created.

    Returns:
      A tf_agent.LossInfo named tuple with the total_loss and all intermediate
        losses in the extra field contained in a PPOLossInfo named tuple.
    """
        # Evaluate the current policy on timesteps.

        # batch_size from time_steps
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(batch_size)
        distribution_step = self._collect_policy.distribution(
            time_steps, policy_state)
        # TODO(eholly): Rename policy distributions to something clear and uniform.
        current_policy_distribution = distribution_step.action

        # Call all loss functions and add all loss values.
        value_estimation_loss = self.value_estimation_loss(
            time_steps, returns, weights, debug_summaries)
        policy_gradient_loss = self.policy_gradient_loss(
            time_steps,
            actions,
            tf.stop_gradient(act_log_probs),
            tf.stop_gradient(normalized_advantages),
            current_policy_distribution,
            weights,
            debug_summaries=debug_summaries)

        if self._policy_l2_reg > 0.0 or self._value_function_l2_reg > 0.0:
            l2_regularization_loss = self.l2_regularization_loss(
                debug_summaries)
        else:
            l2_regularization_loss = tf.zeros_like(policy_gradient_loss)

        if self._entropy_regularization > 0.0:
            entropy_regularization_loss = self.entropy_regularization_loss(
                time_steps, current_policy_distribution, weights,
                debug_summaries)
        else:
            entropy_regularization_loss = tf.zeros_like(policy_gradient_loss)

        kl_penalty_loss = self.kl_penalty_loss(time_steps,
                                               action_distribution_parameters,
                                               current_policy_distribution,
                                               weights, debug_summaries)

        total_loss = (policy_gradient_loss + value_estimation_loss +
                      l2_regularization_loss + entropy_regularization_loss +
                      kl_penalty_loss)

        if gradient_clipping > 0:
            clip_gradients = eager_utils.clip_gradient_norms_fn(
                gradient_clipping)
        else:
            clip_gradients = lambda x: x

        # If summarize_gradients, create functions for summarizing both gradients
        # and variables.
        if summarize_gradients and debug_summaries:

            def _create_summaries(grads_and_vars):
                eager_utils.add_gradients_summaries(grads_and_vars,
                                                    self.train_step_counter)
                eager_utils.add_variables_summaries(grads_and_vars,
                                                    self.train_step_counter)
                grads_and_vars = clip_gradients(grads_and_vars)
                return grads_and_vars

            transform_grads_fn = _create_summaries
        else:
            transform_grads_fn = clip_gradients

        total_loss = eager_utils.create_train_op(
            total_loss,
            self._optimizer,
            global_step=train_step,
            transform_grads_fn=transform_grads_fn,
            variables_to_train=(self._actor_net.trainable_weights +
                                self._value_net.trainable_weights))

        return tf_agent.LossInfo(
            total_loss,
            PPOLossInfo(
                policy_gradient_loss=policy_gradient_loss,
                value_estimation_loss=value_estimation_loss,
                l2_regularization_loss=l2_regularization_loss,
                entropy_regularization_loss=entropy_regularization_loss,
                kl_penalty_loss=kl_penalty_loss,
            ))
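When both clipping and summaries are requested, Example #6 wraps them into a single transform_grads_fn. The composition can be written as a small standalone helper using the same eager_utils summary functions; make_transform_grads_fn is a hypothetical name, not part of tf_agents:

from tf_agents.utils import eager_utils


def make_transform_grads_fn(gradient_clipping, summarize, step_counter):
    """Sketch of the summarize-then-clip composition used in Example #6."""
    clip_fn = (eager_utils.clip_gradient_norms_fn(gradient_clipping)
               if gradient_clipping > 0 else (lambda gv: gv))
    if not summarize:
        return clip_fn

    def transform_grads_fn(grads_and_vars):
        # Summarize the raw gradients and variables first, then clip.
        eager_utils.add_gradients_summaries(grads_and_vars, step_counter)
        eager_utils.add_variables_summaries(grads_and_vars, step_counter)
        return clip_fn(grads_and_vars)

    return transform_grads_fn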