Example 1
  def _train(self, experience, weights):
    rewards, _ = nest_utils.flatten_multi_batched_nested_tensors(
        experience.reward, self._time_step_spec.reward)
    actions, _ = nest_utils.flatten_multi_batched_nested_tensors(
        experience.action, self._action_spec)
    observations, _ = nest_utils.flatten_multi_batched_nested_tensors(
        experience.observation, self._time_step_spec.observation)
    if self._observation_and_action_constraint_splitter is not None:
      observations, _ = self._observation_and_action_constraint_splitter(
          observations)

    with tf.GradientTape() as tape:
      loss_info = self.loss(observations,
                            actions,
                            rewards,
                            weights=weights,
                            training=True)
    tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
    self.compute_summaries(loss_info.loss)
    variables_to_train = self._reward_network.trainable_weights
    if not variables_to_train:
      logging.info('No variable to train in the agent.')
      return loss_info

    grads = tape.gradient(loss_info.loss, variables_to_train)
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
      grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                       self._gradient_clipping)

    if self._summarize_grads_and_vars:
      eager_utils.add_variables_summaries(grads_and_vars,
                                          self.train_step_counter)
      eager_utils.add_gradients_summaries(grads_and_vars,
                                          self.train_step_counter)

    training_lib.apply_gradients(self._optimizer, grads_and_vars,
                                 global_step=self.train_step_counter)

    return loss_info
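
Every example above follows the same eager update pattern: record the loss under a tf.GradientTape, pair gradients with their variables, optionally clip them, and hand the pairs to the optimizer. Below is a minimal, self-contained sketch of that pattern using a toy Keras model and tf.clip_by_norm as a stand-in for the agent's reward network and eager_utils.clip_gradient_norms; the model, data, and clipping threshold are illustrative, not part of TF-Agents.

# Minimal sketch of the tape -> zip -> clip -> apply_gradients pattern.
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
gradient_clipping = 1.0  # analogous to self._gradient_clipping

x = tf.random.normal([32, 4])
y = tf.random.normal([32, 1])

with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x, training=True) - y))

variables_to_train = model.trainable_weights
grads = tape.gradient(loss, variables_to_train)
# Clip each gradient's norm individually, roughly what
# eager_utils.clip_gradient_norms does for the agents above.
grads = [tf.clip_by_norm(g, gradient_clipping) for g in grads]
grads_and_vars = tuple(zip(grads, variables_to_train))
optimizer.apply_gradients(grads_and_vars)
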
Example 2
    def _train(self, experience, weights):
        experience = self._as_trajectory(experience)

        with tf.GradientTape() as tape:
            loss_info = self._loss(experience, weights=weights, training=True)

        variables_to_train = self._variables_to_train()
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars)
        self.train_step_counter.assign_add(1)
        if not self._accepts_per_arm_features and self._num_samples_list:
            # Compute the number of samples for each action in the current batch.
            actions_flattened = tf.reshape(experience.action, [-1])
            num_samples_per_action_current = [
                tf.reduce_sum(tf.cast(tf.equal(actions_flattened, k),
                                      tf.int64))
                for k in range(self._num_actions)
            ]
            # Update the number of samples for each action.
            for a, b in zip(self._num_samples_list,
                            num_samples_per_action_current):
                tf.compat.v1.assign_add(a, b)

        return loss_info
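
Example 2 also keeps a running count of how many times each discrete action appears in the batch, by flattening the action tensor and comparing it against every action index. A standalone sketch of that bookkeeping with toy data and illustrative names:

# Per-action sample counting, assuming a fixed number of discrete actions.
import tensorflow as tf

num_actions = 3
num_samples_list = [tf.Variable(0, dtype=tf.int64) for _ in range(num_actions)]

actions = tf.constant([[0, 2], [1, 2]])          # toy batch of actions
actions_flattened = tf.reshape(actions, [-1])    # flatten batch and time dims

for k in range(num_actions):
    count_k = tf.reduce_sum(
        tf.cast(tf.equal(actions_flattened, k), tf.int64))
    num_samples_list[k].assign_add(count_k)

print([v.numpy() for v in num_samples_list])     # [1, 1, 2]
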
Example 3
    def _train(self, experience, weights):
        (observations, actions,
         rewards) = bandit_utils.process_experience_for_neural_agents(
             experience, self._accepts_per_arm_features,
             self.training_data_spec)
        if self._observation_and_action_constraint_splitter is not None:
            observations, _ = self._observation_and_action_constraint_splitter(
                observations)

        with tf.GradientTape() as tape:
            loss_info = self.loss(observations,
                                  actions,
                                  rewards,
                                  weights=weights,
                                  training=True)

        variables_to_train = self._variables_to_train()
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars)
        self.train_step_counter.assign_add(1)

        return loss_info
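
In Examples 1 and 3, the observations are passed through self._observation_and_action_constraint_splitter before the loss is computed. In TF-Agents this is a user-supplied callable that splits the raw observation into the network input and a per-action constraint mask; the training path only keeps the first element. A hypothetical splitter, with assumed dict keys, might look like this:

# Hedged sketch of an observation_and_action_constraint_splitter.
import tensorflow as tf

def splitter(obs):
    # 'state' feeds the network, 'mask' marks which actions are currently legal.
    # These keys are assumptions for illustration only.
    return obs['state'], obs['mask']

raw_obs = {
    'state': tf.random.normal([2, 4]),
    'mask': tf.constant([[1, 0, 1], [1, 1, 0]]),
}
# In the _train methods above, only the first element is kept:
observations, _ = splitter(raw_obs)
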
Example 4
    def _train(self, experience, weights):
        time_steps, actions, next_time_steps = self._experience_to_transitions(
            experience)

        with tf.GradientTape() as tape:
            loss_info = self.loss(
                time_steps,
                actions,
                next_time_steps,
                td_errors_loss_fn=self._td_errors_loss_fn,
                gamma=self._gamma,
                reward_scale_factor=self._reward_scale_factor,
                weights=weights)
        tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
        variables_to_train = self._q_network.trainable_weights
        assert list(
            variables_to_train), "No variables in the agent's q_network."
        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars,
                                        global_step=self.train_step_counter)

        self._update_target()

        return loss_info
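
Example 4 ends by calling self._update_target(), which keeps the target Q-network in sync with the online one. A common realization of that idea (not necessarily the one configured here) is a soft update that nudges each target variable toward its online counterpart; a minimal sketch with illustrative names and a toy network:

# Hedged sketch of a soft target-network update, tau chosen arbitrarily.
import tensorflow as tf

def soft_update(target_variables, source_variables, tau=0.005):
    # Move each target variable a small step toward its online counterpart.
    for t_var, s_var in zip(target_variables, source_variables):
        t_var.assign((1.0 - tau) * t_var + tau * s_var)

def make_q_net():
    return tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(4,))])

q_net = make_q_net()
target_q_net = make_q_net()
soft_update(target_q_net.trainable_weights, q_net.trainable_weights)
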
Example 5
    def _train(self, experience, weights):
        # Get individual tensors from transitions.
        (time_steps, policy_steps_,
         next_time_steps) = trajectory.to_transition(experience)
        actions = policy_steps_.action

        if self._debug_summaries:
            actions_list = tf.nest.flatten(actions)
            show_action_index = len(actions_list) != 1
            for i, single_action in enumerate(actions_list):
                action_name = ('actions_{}'.format(i)
                               if show_action_index else 'actions')
                tf.compat.v2.summary.histogram(name=action_name,
                                               data=single_action,
                                               step=self.train_step_counter)

        action_distribution_parameters = policy_steps_.info

        # Reconstruct per-timestep policy distribution from stored distribution
        #   parameters.
        old_actions_distribution = (
            distribution_spec.nested_distributions_from_specs(
                self._action_distribution_spec,
                action_distribution_parameters))

        # Compute log probability of actions taken during data collection, using the
        #   collect policy distribution.
        act_log_probs = common.log_probability(old_actions_distribution,
                                               actions, self._action_spec)

        # Compute the value predictions for states using the current value function.
        # To be used for return & advantage computation.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(
            batch_size=batch_size)

        value_preds, unused_policy_state = self._collect_policy.apply_value_network(
            experience.observation,
            experience.step_type,
            policy_state=policy_state)
        value_preds = tf.stop_gradient(value_preds)

        valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

        if weights is None:
            weights = valid_mask
        else:
            weights *= valid_mask

        returns, normalized_advantages = self.compute_return_and_advantage(
            next_time_steps, value_preds)

        # Loss tensors across batches will be aggregated for summaries.
        policy_gradient_losses = []
        value_estimation_losses = []
        l2_regularization_losses = []
        entropy_regularization_losses = []
        kl_penalty_losses = []

        loss_info = None  # TODO(b/123627451): Remove.
        # For each epoch, create its own train op that depends on the previous one.
        for i_epoch in range(self._num_epochs):
            with tf.name_scope('epoch_%d' % i_epoch):
                # Only save debug summaries for first and last epochs.
                debug_summaries = (self._debug_summaries
                                   and (i_epoch == 0
                                        or i_epoch == self._num_epochs - 1))

                # Build one epoch train op.
                with tf.GradientTape() as tape:
                    loss_info = self.get_epoch_loss(
                        time_steps, actions, act_log_probs, returns,
                        normalized_advantages, action_distribution_parameters,
                        weights, self.train_step_counter, debug_summaries)

                variables_to_train = (self._actor_net.trainable_weights +
                                      self._value_net.trainable_weights)
                grads = tape.gradient(loss_info.loss, variables_to_train)
                # Tuple is used for py3, where zip is a generator producing values once.
                grads_and_vars = tuple(zip(grads, variables_to_train))
                if self._gradient_clipping > 0:
                    grads_and_vars = eager_utils.clip_gradient_norms(
                        grads_and_vars, self._gradient_clipping)

                # If summarize_gradients, create functions for summarizing both
                # gradients and variables.
                if self._summarize_grads_and_vars and debug_summaries:
                    eager_utils.add_gradients_summaries(
                        grads_and_vars, self.train_step_counter)
                    eager_utils.add_variables_summaries(
                        grads_and_vars, self.train_step_counter)

                self._optimizer.apply_gradients(
                    grads_and_vars, global_step=self.train_step_counter)

                policy_gradient_losses.append(
                    loss_info.extra.policy_gradient_loss)
                value_estimation_losses.append(
                    loss_info.extra.value_estimation_loss)
                l2_regularization_losses.append(
                    loss_info.extra.l2_regularization_loss)
                entropy_regularization_losses.append(
                    loss_info.extra.entropy_regularization_loss)
                kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

        # After update epochs, update adaptive kl beta, then update observation
        #   normalizer and reward normalizer.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(batch_size)
        # Compute the mean kl from previous action distribution.
        kl_divergence = self._kl_divergence(
            time_steps, action_distribution_parameters,
            self._collect_policy.distribution(time_steps, policy_state).action)
        self.update_adaptive_kl_beta(kl_divergence)

        if self._observation_normalizer:
            self._observation_normalizer.update(time_steps.observation,
                                                outer_dims=[0, 1])
        else:
            # TODO(b/127661780): Verify performance of reward_normalizer when obs are
            #                    not normalized
            if self._reward_normalizer:
                self._reward_normalizer.update(next_time_steps.reward,
                                               outer_dims=[0, 1])

        loss_info = tf.nest.map_structure(tf.identity, loss_info)

        # Make summaries for total loss across all epochs.
        # The *_losses lists will have been populated by
        #   calls to self.get_epoch_loss.
        with tf.name_scope('Losses/'):
            total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
            total_value_estimation_loss = tf.add_n(value_estimation_losses)
            total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
            total_entropy_regularization_loss = tf.add_n(
                entropy_regularization_losses)
            total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
            tf.compat.v2.summary.scalar(name='policy_gradient_loss',
                                        data=total_policy_gradient_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='value_estimation_loss',
                                        data=total_value_estimation_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='l2_regularization_loss',
                                        data=total_l2_regularization_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='entropy_regularization_loss',
                                        data=total_entropy_regularization_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='kl_penalty_loss',
                                        data=total_kl_penalty_loss,
                                        step=self.train_step_counter)

            total_abs_loss = (tf.abs(total_policy_gradient_loss) +
                              tf.abs(total_value_estimation_loss) +
                              tf.abs(total_entropy_regularization_loss) +
                              tf.abs(total_l2_regularization_loss) +
                              tf.abs(total_kl_penalty_loss))

            tf.compat.v2.summary.scalar(name='total_abs_loss',
                                        data=total_abs_loss,
                                        step=self.train_step_counter)

        if self._summarize_grads_and_vars:
            with tf.name_scope('Variables/'):
                all_vars = (self._actor_net.trainable_weights +
                            self._value_net.trainable_weights)
                for var in all_vars:
                    tf.compat.v2.summary.histogram(
                        name=var.name.replace(':', '_'),
                        data=var,
                        step=self.train_step_counter)

        return loss_info
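
The distinctive part of Example 5 is its epoch loop: the same batch of experience is reused for several optimization passes, each under a fresh GradientTape, and the per-epoch losses are accumulated and summed for the summaries. A skeletal, self-contained sketch of that structure with a toy model and loss in place of the PPO agent's networks:

# Multi-epoch optimization over one batch, per-epoch losses aggregated.
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.Adam(1e-3)
num_epochs = 3

x = tf.random.normal([16, 8])
y = tf.random.normal([16, 1])

epoch_losses = []
for i_epoch in range(num_epochs):
    with tf.name_scope('epoch_%d' % i_epoch):
        with tf.GradientTape() as tape:
            loss = tf.reduce_mean(tf.square(model(x, training=True) - y))
        variables_to_train = model.trainable_weights
        grads = tape.gradient(loss, variables_to_train)
        optimizer.apply_gradients(zip(grads, variables_to_train))
        epoch_losses.append(loss)

total_loss = tf.add_n(epoch_losses)  # summed across epochs, as in the summaries
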
Example 6
def _create_summaries(grads_and_vars):
    eager_utils.add_gradients_summaries(grads_and_vars)
    eager_utils.add_variables_summaries(grads_and_vars)
    grads_and_vars = clip_gradients(grads_and_vars)
    return grads_and_vars
    def _train(self, experience, weights):
        # Q-network training
        with tf.GradientTape() as tape:
            loss_info_q = self._loss(
                experience,
                td_errors_loss_fn=self._td_errors_loss_fn,
                gamma=self._gamma,
                reward_scale_factor=self._reward_scale_factor,
                weights=weights,
                training=True)
        tf.debugging.check_numerics(loss_info_q.loss, 'Loss is inf or nan')
        variables_to_train = self._q_network.trainable_weights
        non_trainable_weights = self._q_network.non_trainable_weights
        assert list(
            variables_to_train), "No variables in the agent's q_network."
        grads = tape.gradient(loss_info_q.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = list(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            grads_and_vars_with_non_trainable = (
                grads_and_vars + [(None, v) for v in non_trainable_weights])
            eager_utils.add_variables_summaries(
                grads_and_vars_with_non_trainable, self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)
        self._optimizer.apply_gradients(grads_and_vars)

        # H-network training
        with tf.GradientTape() as tape:
            loss_info_h = self._loss_h(
                experience,
                td_errors_loss_fn=self._td_errors_loss_fn,
                gamma=self._gamma,
                reward_scale_factor=self._reward_scale_factor,
                weights=weights,
                training=True)
        tf.debugging.check_numerics(loss_info_h.loss, 'Loss is inf or nan')
        variables_to_train_h = self._h_network.trainable_weights
        non_trainable_weights_h = self._h_network.non_trainable_weights
        assert list(
            variables_to_train_h), "No variables in the agent's h_network."
        grads_h = tape.gradient(loss_info_h.loss, variables_to_train_h)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars_h = list(zip(grads_h, variables_to_train_h))
        if self._gradient_clipping is not None:
            grads_and_vars_h = eager_utils.clip_gradient_norms(
                grads_and_vars_h, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            grads_and_vars_with_non_trainable_h = (
                grads_and_vars_h + [(None, v)
                                    for v in non_trainable_weights_h])
            eager_utils.add_variables_summaries(
                grads_and_vars_with_non_trainable_h, self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars_h,
                                                self.train_step_counter)
        self._optimizer.apply_gradients(grads_and_vars_h)
        self.train_step_counter.assign_add(1)

        self._update_target()

        return loss_info_q
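
Example 6 also summarizes the non-trainable weights by pairing them with a None gradient. Conceptually, the eager_utils helpers emit one histogram per gradient and per variable at the current train step; a hedged, standalone sketch of that behavior with the TF2 summary API, a toy model, and an illustrative writer path:

# Gradient and variable histograms keyed by the train step counter.
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
train_step_counter = tf.Variable(0, dtype=tf.int64)
writer = tf.summary.create_file_writer('/tmp/summaries')  # illustrative path

with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(tf.random.normal([8, 4]))))
grads = tape.gradient(loss, model.trainable_weights)
grads_and_vars = list(zip(grads, model.trainable_weights))

with writer.as_default():
    for grad, var in grads_and_vars:
        name = var.name.replace(':', '_')
        if grad is not None:  # non-trainable entries would carry grad=None
            tf.summary.histogram('gradients/' + name, grad,
                                 step=train_step_counter)
        tf.summary.histogram('variables/' + name, var,
                             step=train_step_counter)
train_step_counter.assign_add(1)
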