Example #1
    def body(self, features):
        """Transformer main model_fn.

        Args:
          features: Map of features to the model. Should contain the following:
              "inputs": Transformer inputs.
                  [batch_size, input_length, 1, hidden_dim].
              "targets": Target decoder outputs.
                  [batch_size, decoder_length, 1, hidden_dim]
              "target_space_id": A scalar int from data_generators.problem.SpaceID.

        Returns:
          Final decoder representation. [batch_size, decoder_length, hidden_dim]
        """
        hparams = self._hparams

        losses = []

        # The encoder is disabled in this variant; decode without encoder
        # context (decoder-only setup).
        encoder_output, encoder_decoder_attention_bias = (None, None)
        lekeys = "inputs"
        if lekeys in features:
            targets = features["inputs"]
            lekeys = "inputs"
        else:
            targets = features["targets"]
            lekeys = "targets"
        targets_shape = common_layers.shape_list(targets)
        targets = common_layers.flatten4d3d(targets)
        decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
            targets, hparams, features=features)
        decoder_output = self.decode(decoder_input,
                                     encoder_output,
                                     encoder_decoder_attention_bias,
                                     decoder_self_attention_bias,
                                     hparams,
                                     nonpadding=features_to_nonpadding(
                                         features, lekeys),
                                     losses=losses)

        expected_attentions = features.get("expected_attentions")
        if expected_attentions is not None:
            attention_loss = common_attention.encoder_decoder_attention_loss(
                expected_attentions, self.attention_weights,
                hparams.expected_attention_loss_type,
                hparams.expected_attention_loss_multiplier)
            return decoder_output, {"attention_loss": attention_loss}

        ret = tf.reshape(decoder_output, targets_shape)
        if losses:
            return ret, {"extra_loss": tf.add_n(losses)}
        else:
            return ret
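A note on the reshape above: common_layers.flatten4d3d merges the two middle axes, turning the canonical [batch_size, length, 1, hidden_dim] feature tensors into the [batch_size, length, hidden_dim] inputs the decoder expects. A minimal sketch of the equivalent static-shape reshape (the real helper works with dynamic shapes via shape_list):

import tensorflow as tf

def flatten4d3d_sketch(x):
    # [batch, length, 1, hidden] -> [batch, length * 1, hidden]
    b, l, one, h = x.shape.as_list()
    return tf.reshape(x, [b, l * one, h])

x = tf.zeros([2, 7, 1, 16])
y = flatten4d3d_sketch(x)  # shape [2, 7, 16]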
  def body(self, features):
    """dual_Transformer main model_fn.

    Args:
      features: Map of features to the model. Should contain the following:
          "wav_inputs"/"txt_inputs": Transformer inputs for the two encoders.
              [batch_size, input_length, hidden_dim]
          "targets": Target decoder outputs.
              [batch_size, decoder_length, hidden_dim]
          "target_space_id": A scalar int from data_generators.problem.SpaceID.

    Returns:
      Final decoder representation. [batch_size, decoder_length, hidden_dim]
    """
    hparams = self._hparams

    losses = []
    assert self.has_input, "problems for dual-transformer must have inputs"
    if self.has_input:
      inputs1 = features["wav_inputs"]
      inputs2 = features["txt_inputs"]
      target_space = features["target_space_id"]
      (wav_encoder_output, wav_enc_dec_attention_bias,
       txt_encoder_output, txt_enc_dec_attention_bias) = self.dual_encode(
           inputs1, inputs2, target_space, hparams, features=features,
           losses=losses)
    else:
      # Unreachable given the assert above; kept for interface symmetry.
      (wav_encoder_output, wav_enc_dec_attention_bias,
       txt_encoder_output, txt_enc_dec_attention_bias) = (None, None,
                                                          None, None)

    targets = features["targets"]
    targets_shape = common_layers.shape_list(targets)
    targets = common_layers.flatten4d3d(targets)

    decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
        targets, hparams, features=features)

    decoder_output = self.dual_decode(
        decoder_input,
        wav_encoder_output, txt_encoder_output,
        wav_enc_dec_attention_bias,
        txt_enc_dec_attention_bias,
        decoder_self_attention_bias,
        hparams,
        nonpadding=features_to_nonpadding(features, "targets"),
        losses=losses)

    expected_attentions = features.get("expected_attentions")
    if expected_attentions is not None:
      attention_loss = common_attention.encoder_decoder_attention_loss(
          expected_attentions, self.attention_weights,
          hparams.expected_attention_loss_type,
          hparams.expected_attention_loss_multiplier)
      return decoder_output, {"attention_loss": attention_loss}

    ret = tf.reshape(decoder_output, targets_shape)
    if losses:
      return ret, {"extra_loss": tf.add_n(losses)}
    else:
      return ret
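dual_encode and dual_decode are custom extensions rather than core tensor2tensor methods. A hypothetical feature map this body expects, with made-up shapes, might look like:

import numpy as np

features = {
    "wav_inputs": np.zeros((2, 120, 1, 512), np.float32),  # acoustic stream
    "txt_inputs": np.zeros((2, 30, 1, 512), np.float32),   # text stream
    "targets": np.zeros((2, 25, 1, 512), np.float32),
    "target_space_id": 1,
}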
  def body(self, features):
    """Transformer main model_fn.

    Args:
      features: Map of features to the model. Should contain the following:
          "inputs": Transformer inputs.
              [batch_size, input_length, 1, hidden_dim].
          "targets": Target decoder outputs.
              [batch_size, decoder_length, 1, hidden_dim]
          "target_space_id": A scalar int from data_generators.problem.SpaceID.

    Returns:
      Final decoder representation. [batch_size, decoder_length, hidden_dim]
    """
    hparams = self._hparams

    losses = []

    if self.has_input:
      raise NotImplementedError("Context transformer encoder not implemented")
      # Unreachable until the encoder is implemented:
      inputs = features["inputs"]
      target_space = features["target_space_id"]
      encoder_output, encoder_decoder_attention_biases = self.encode(
          inputs, target_space, hparams, features=features, losses=losses)
    else:
      encoder_output, encoder_decoder_attention_biases = (None, None)

    targets = features["targets"]
    targets_shape = common_layers.shape_list(targets)
    targets = common_layers.flatten4d3d(targets)
    decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
        targets, hparams, features=features)
    decoder_self_attention_biases = expand_bias_modes(
        decoder_self_attention_bias, features["targets_seg"])
    decoder_output = self.decode(
        decoder_input,
        encoder_output,
        encoder_decoder_attention_biases,
        decoder_self_attention_biases,
        hparams,
        nonpadding=features_to_nonpadding(features, "targets"),
        losses=losses)

    expected_attentions = features.get("expected_attentions")
    if expected_attentions is not None:
      attention_loss = common_attention.encoder_decoder_attention_loss(
          expected_attentions, self.attention_weights,
          hparams.expected_attention_loss_type,
          hparams.expected_attention_loss_multiplier)
      return decoder_output, {"attention_loss": attention_loss}

    ret = tf.reshape(decoder_output, targets_shape)
    if losses:
      return ret, {"extra_loss": tf.add_n(losses)}
    else:
      return ret
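features_to_nonpadding derives a 0/1 mask telling the decoder which positions are real tokens. A minimal sketch of what this helper typically computes in tensor2tensor, assuming segmentation features mark real tokens with nonzero ids:

import tensorflow as tf

def features_to_nonpadding_sketch(features, inputs_or_targets="inputs"):
    # 1.0 at real tokens, 0.0 at padding; None when no segmentation exists.
    key = inputs_or_targets + "_segmentation"
    if features and key in features:
        return tf.minimum(tf.cast(features[key], tf.float32), 1.0)
    return None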
Example #4
    def body(self, features):
        """Transformer main model_fn.

    Args:
      features: Map of features to the model. Should contain the following:
          "inputs": Transformer inputs [batch_size, input_length, hidden_dim]
          "tragets": Target decoder outputs.
              [batch_size, decoder_length, hidden_dim]
          "target_space_id"

    Returns:
      Final decoder representation. [batch_size, decoder_length, hidden_dim]
    """
        hparams = self._hparams

        if self.has_input:
            inputs = features["inputs"]
            target_space = features["target_space_id"]
            encoder_output, encoder_decoder_attention_bias = self.encode(
                inputs, target_space, hparams, features=features)
        else:
            encoder_output, encoder_decoder_attention_bias = (None, None)

        targets = features["targets"]
        targets = common_layers.flatten4d3d(targets)

        decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
            targets, hparams, features=features)

        decoder_output = self.decode(decoder_input,
                                     encoder_output,
                                     encoder_decoder_attention_bias,
                                     decoder_self_attention_bias,
                                     hparams,
                                     nonpadding=features_to_nonpadding(
                                         features, "targets"))

        self.cache_flag = tf.py_func(
            self.sentence_cache.AddMultipleEntries,
            [features["targets_raw"], decoder_output],
            tf.float32,
        )

        self.cache_flag = tf.cast(self.cache_flag, tf.float32)

        expected_attentions = features.get("expected_attentions")
        if expected_attentions is not None:
            attention_loss = common_attention.encoder_decoder_attention_loss(
                expected_attentions, self.attention_weights,
                hparams.expected_attention_loss_type,
                hparams.expected_attention_loss_multiplier)
            return decoder_output, {"attention_loss": attention_loss}
        self.cache_flag.set_shape((1, ))
        return decoder_output + 0 * self.cache_flag
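Returning decoder_output + 0 * self.cache_flag is a dependency trick: the tf.py_func that updates the sentence cache only runs if its output is consumed, so the flag is folded into the result without changing its value. A self-contained sketch of the pattern, assuming the TF1-style API used throughout these snippets and a hypothetical side effect:

import numpy as np
import tensorflow as tf

def update_cache(x):
    # Hypothetical side effect (e.g. writing to an external cache).
    return np.float32(0.0)

output = tf.ones([1])
flag = tf.py_func(update_cache, [output], tf.float32)
flag.set_shape(())
output = output + 0 * flag  # numerically a no-op, but forces `flag` to run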
    def body(self, features):
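        """Transformer with context features: main model_fn.

        Args:
          features: Map of features to the model. Must contain "inputs" and
              "target_space_id"; every feature whose name contains "context"
              (and not "raw") is passed to the encoder as context.

        Returns:
          Final decoder representation. [batch_size, decoder_length, hidden_dim]
        """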

        hparams = self._hparams
        losses = []

        contexts = {}
        for feature_name in features:
            if 'context' in feature_name and 'raw' not in feature_name:
                contexts[feature_name] = features[feature_name]
        inputs = features["inputs"]
        target_space = features["target_space_id"]

        encoder_output, encoder_decoder_attention_bias = self.encode(
            inputs,
            contexts,
            target_space,
            hparams=hparams,
            features=features,
            losses=losses)

        targets = features["targets"]
        targets_shape = common_layers.shape_list(targets)
        targets = common_layers.flatten4d3d(targets)

        decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
            targets, hparams, features=features)

        decoder_output = self.decode(decoder_input,
                                     encoder_output,
                                     encoder_decoder_attention_bias,
                                     decoder_self_attention_bias,
                                     hparams=hparams,
                                     nonpadding=features_to_nonpadding(
                                         features, "targets"),
                                     losses=losses)

        expected_attentions = features.get("expected_attentions")
        if expected_attentions is not None:
            attention_loss = common_attention.encoder_decoder_attention_loss(
                expected_attentions, self.attention_weights,
                hparams.expected_attention_loss_type,
                hparams.expected_attention_loss_multiplier)
            return decoder_output, {"attention_loss": attention_loss}

        ret = tf.reshape(decoder_output, targets_shape)
        if losses:
            return ret, {"extra_loss": tf.add_n(losses)}
        else:
            return ret
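To illustrate the context-gathering loop above, here is a hypothetical feature map: every key containing "context" (but not "raw") is collected and handed to encode().

import numpy as np

features = {
    "inputs": np.zeros((2, 7, 1, 512), np.float32),
    "context_0": np.zeros((2, 5, 1, 512), np.float32),  # collected
    "context_0_raw": np.zeros((2, 5), np.int32),        # skipped ("raw")
    "targets": np.zeros((2, 9, 1, 512), np.float32),
    "target_space_id": 3,
}
contexts = {k: v for k, v in features.items()
            if "context" in k and "raw" not in k}
assert list(contexts) == ["context_0"]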
  def body(self, features):
    """Transformer main model_fn.

    Args:
      features: Map of features to the model. Should contain the following:
          "inputs": Transformer inputs [batch_size, input_length, hidden_dim]
          "tragets": Target decoder outputs.
              [batch_size, decoder_length, hidden_dim]
          "target_space_id"

    Returns:
      Final decoder representation. [batch_size, decoder_length, hidden_dim]
    """
    hparams = self._hparams

    if self.has_input:
      inputs = features["inputs"]
      target_space = features["target_space_id"]
      encoder_output, encoder_decoder_attention_bias = self.encode(
          inputs, target_space, hparams, features=features)
    else:
      encoder_output, encoder_decoder_attention_bias = (None, None)

    targets = features["targets"]
    targets = common_layers.flatten4d3d(targets)

    decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
        targets, hparams, features=features)

    decoder_output = self.decode(
        decoder_input,
        encoder_output,
        encoder_decoder_attention_bias,
        decoder_self_attention_bias,
        hparams,
        nonpadding=features_to_nonpadding(features, "targets"))

    expected_attentions = features.get("expected_attentions")
    if expected_attentions is not None:
      attention_loss = common_attention.encoder_decoder_attention_loss(
          expected_attentions, self.attention_weights)
      return decoder_output, {"attention_loss": attention_loss}

    return decoder_output
Example #8
  def body(self, features):
    """Universal Transformer main model_fn.


    Args:
      features: Map of features to the model. Should contain the following:
          "inputs": Transformer inputs [batch_size, input_length, hidden_dim]
          "targets": Target decoder outputs.
              [batch_size, decoder_length, hidden_dim]
          "target_space_id"

    Returns:
      Final decoder representation. [batch_size, decoder_length, hidden_dim]
    """
    hparams = self._hparams
    if hparams.add_position_timing_signal:
      # Turning off addition of positional embedding in the encoder/decoder
      # preparation as we do it in the beginning of each step.
      hparams.pos = None

    if self.has_input:
      inputs = features["inputs"]
      target_space = features["target_space_id"]
      (encoder_output, encoder_decoder_attention_bias,
       enc_extra_output) = self.encode(
           inputs, target_space, hparams, features=features)
    else:
      (encoder_output, encoder_decoder_attention_bias,
       enc_extra_output) = (None, None, (None, None))

    targets = features["targets"]
    targets = common_layers.flatten4d3d(targets)

    (decoder_input,
     decoder_self_attention_bias) = transformer.transformer_prepare_decoder(
         targets, hparams, features=features)

    decoder_output, dec_extra_output = self.decode(
        decoder_input,
        encoder_output,
        encoder_decoder_attention_bias,
        decoder_self_attention_bias,
        hparams,
        nonpadding=transformer.features_to_nonpadding(features, "targets"))

    expected_attentions = features.get("expected_attentions")
    if expected_attentions is not None:
      attention_loss = common_attention.encoder_decoder_attention_loss(
          expected_attentions, self.attention_weights,
          hparams.expected_attention_loss_type,
          hparams.expected_attention_loss_multiplier)
      return decoder_output, {"attention_loss": attention_loss}

    if hparams.recurrence_type == "act" and hparams.act_loss_weight != 0:
      if self.has_input:
        enc_ponder_times, enc_remainders = enc_extra_output
        enc_act_loss = (
            hparams.act_loss_weight *
            tf.reduce_mean(enc_ponder_times + enc_remainders))
      else:
        enc_act_loss = 0.0

      (dec_ponder_times, dec_remainders) = dec_extra_output
      dec_act_loss = (
          hparams.act_loss_weight *
          tf.reduce_mean(dec_ponder_times + dec_remainders))
      act_loss = enc_act_loss + dec_act_loss
      tf.contrib.summary.scalar("act_loss", act_loss)
      return decoder_output, {"act_loss": act_loss}

    return decoder_output
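The ACT penalty above charges each position for the recurrent steps it took (ponder_times) plus the leftover halting probability (remainders), averaged over the batch and scaled by act_loss_weight. A small numeric sketch with made-up values:

import tensorflow as tf

act_loss_weight = 0.01
ponder_times = tf.constant([[2.0, 3.0], [1.0, 4.0]])  # steps per position
remainders = tf.constant([[0.3, 0.1], [0.5, 0.2]])    # leftover halting mass
act_loss = act_loss_weight * tf.reduce_mean(ponder_times + remainders)
# act_loss evaluates to 0.01 * 2.775 = 0.02775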
    def body(self, features):
        """Transformer main model_fn.

    Args:
      features: Map of features to the model. Should contain the following:
          "inputs": Transformer inputs. [batch_size, input_length, 1,
            hidden_dim].
          "targets": Target decoder outputs. [batch_size, decoder_length, 1,
            hidden_dim]
          "target_space_id": A scalar int from data_generators.problem.SpaceID.

    Returns:
      Final decoder representation. [batch_size, decoder_length, hidden_dim]
    """
        hparams = self._hparams

        losses = []

        if self.has_input:
            # use melody-only as input features
            inputs = features["melody"]
            target_space = features["target_space_id"]
            encoder_output, encoder_decoder_attention_bias = self.encode(
                inputs,
                target_space,
                hparams,
                features=features,
                losses=losses)
        else:
            encoder_output, encoder_decoder_attention_bias = (None, None)

        targets = features["targets"]
        targets_shape = common_layers.shape_list(targets)
        targets = common_layers.flatten4d3d(targets)
        decoder_input, decoder_self_attention_bias = self._prepare_decoder_fn(
            targets, hparams, features=features)

        # Not all subclasses of Transformer support keyword arguments related to
        # recurrent memory, so only pass these arguments if memory is enabled.
        decode_kwargs = {}
        if self.recurrent_memory_by_layer is not None:
            # TODO(kitaev): The chunk_number feature currently has the same shape as
            # "targets", but this is only for the purposes of sharing sharding code.
            # In fact every token within an example must have the same chunk number.
            chunk_number_each_token = tf.squeeze(features["chunk_number"],
                                                 (-1, -2))
            chunk_number_each_example = chunk_number_each_token[:, 0]
            # Uncomment the code below to verify that tokens within a batch share the
            # same chunk number:
            # with tf.control_dependencies([
            #     tf.assert_equal(chunk_number_each_token,
            #                     chunk_number_each_example[:, None])
            # ]):
            #   chunk_number_each_example = tf.identity(chunk_number_each_example)
            decode_kwargs = dict(
                recurrent_memory_by_layer=self.recurrent_memory_by_layer,
                chunk_number=chunk_number_each_example,
            )
        decoder_output = self.decode(decoder_input,
                                     encoder_output,
                                     encoder_decoder_attention_bias,
                                     decoder_self_attention_bias,
                                     hparams,
                                     nonpadding=features_to_nonpadding(
                                         features, "targets"),
                                     losses=losses,
                                     **decode_kwargs)
        expected_attentions = features.get("expected_attentions")
        if expected_attentions is not None:
            attention_loss = common_attention.encoder_decoder_attention_loss(
                expected_attentions, self.attention_weights,
                hparams.expected_attention_loss_type,
                hparams.expected_attention_loss_multiplier)
            return decoder_output, {"attention_loss": attention_loss}

        ret = tf.reshape(decoder_output, targets_shape)
        if losses:
            return ret, {"extra_loss": tf.add_n(losses)}
        else:
            return ret
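The chunk_number handling above relies on every token in an example sharing one chunk id. A minimal sketch of the reduction, with hypothetical shapes:

import tensorflow as tf

chunk_number = tf.fill([2, 5, 1, 1], 3)         # [batch, length, 1, 1]
per_token = tf.squeeze(chunk_number, (-1, -2))  # [batch, length]
per_example = per_token[:, 0]                   # [batch]; one id per example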
Example #10
    def body(self, features):
        """Transformer main model_fn.

    Args:
      features: Map of features to the model. Should contain the following:
          "inputs": Transformer inputs.
              [batch_size, input_length, 1, hidden_dim].
          "targets": Target decoder outputs.
              [batch_size, decoder_length, 1, hidden_dim]
          "target_space_id": A scalar int from data_generators.problem.SpaceID.

    Returns:
      Final decoder representation. [batch_size, decoder_length, hidden_dim]
    """
        hparams = self._hparams

        losses = []

        if self.has_input:
            inputs = features["inputs"]
            target_space = features["target_space_id"]
            encoder_output, encoder_decoder_attention_bias = self.encode(
                inputs,
                target_space,
                hparams,
                features=features,
                losses=losses)
        else:
            encoder_output, encoder_decoder_attention_bias = (None, None)

        targets = features["targets"]
        targets_shape = common_layers.shape_list(targets)
        targets = common_layers.flatten4d3d(targets)
        (left_decoder_input,
         left_decoder_self_attention_bias) = transformer_prepare_decoder(
             targets, hparams, features=features)
        (right_decoder_input,
         right_decoder_self_attention_bias) = transformer_prepare_decoder_right(
             targets, hparams, features=features)
        non_pad = features_to_nonpadding(features, "targets")
        with tf.variable_scope("left_decoder"):
            left_decoder_output = self.decode(left_decoder_input,
                                              encoder_output,
                                              encoder_decoder_attention_bias,
                                              left_decoder_self_attention_bias,
                                              hparams,
                                              nonpadding=non_pad,
                                              losses=losses)
        with tf.variable_scope("right_decoder"):
            right_decoder_output = self.decode(
                right_decoder_input,
                encoder_output,
                encoder_decoder_attention_bias,
                right_decoder_self_attention_bias,
                hparams,
                nonpadding=non_pad,
                losses=losses)

        decoder_output = transformer_bidirectional_joint_decoder(
            tf.squeeze(left_decoder_output, axis=2),
            tf.squeeze(right_decoder_output, axis=2),
            encoder_output,
            encoder_decoder_attention_bias,
            hparams,
            nonpadding=non_pad,
            save_weights_to=self.attention_weights,
            losses=losses)
        decoder_output = tf.expand_dims(decoder_output, axis=2)

        expected_attentions = features.get("expected_attentions")
        if expected_attentions is not None:
            attention_loss = common_attention.encoder_decoder_attention_loss(
                expected_attentions, self.attention_weights,
                hparams.expected_attention_loss_type,
                hparams.expected_attention_loss_multiplier)
            return decoder_output, {"attention_loss": attention_loss}

        ret = tf.reshape(decoder_output, targets_shape)
        if losses:
            return ret, {"extra_loss": tf.add_n(losses)}
        else:
            return ret
    def body(self, features):
        """Transformer main model_fn.

    Args:
      features: Map of features to the model. Should contain the following:
          "inputs": Transformer inputs [batch_size, input_length, hidden_dim]
          "targets": Target decoder outputs. [batch_size, decoder_length,
            hidden_dim]
          "target_space_id": A scalar int from data_generators.problem.SpaceID.

    Returns:
      Final decoder representation. [batch_size, decoder_length, hidden_dim]
    """
        tf.logging.info("Using PgScratch BODY function.")
        hparams = self._hparams

        losses = {}
        inputs = features["inputs"]
        target_space = features["target_space_id"]
        # encoder_output: <tf.float32>[batch_size, input_length, hidden_dim]
        # encoder_decoder_attention_bias: <tf.float32>[batch_size, input_length]
        encoder_output, encoder_decoder_attention_bias = self.encode(
            inputs, target_space, hparams, features=features, losses=losses)

        with tf.variable_scope("knowledge"):
            with tf.name_scope("knowledge_encoding"):
                # Encode knowledge.
                # <tf.float32>[batch_size, triple_num, emb_dim]
                fact_embedding, fact_lengths = self.encode_knowledge_bottom(
                    features)
                tf.logging.info("Encoded knowledge")

            with tf.name_scope("knowledge_selection_and_loss"):
                # Compute knowledge selection and loss.
                (triple_logits, avg_triple_selection_loss,
                 knowledge_encoder_output,
                 transe_loss) = self.compute_knowledge_selection_and_loss(
                     features, encoder_output, fact_embedding, fact_lengths,
                     hparams.margin, hparams.num_negative_samples)
                losses["kb_loss"] = avg_triple_selection_loss
                losses["transe_loss"] = transe_loss

        if hparams.attend_kb:
            tf.logging.info("ATTEND_KB is ACTIVE")
            with tf.name_scope("knowledge_attention"):

                knowledge_padding = tf.zeros_like(triple_logits,
                                                  dtype=tf.float32)
                knowledge_attention_bias = common_attention.attention_bias_ignore_padding(
                    knowledge_padding)
                encoder_output = tf.concat(
                    [knowledge_encoder_output, encoder_output], 1)
                encoder_decoder_attention_bias = tf.concat(
                    [knowledge_attention_bias, encoder_decoder_attention_bias],
                    -1)

        else:
            tf.logging.info("ATTEND_KB is INACTIVE")

        targets = features["targets"]
        targets_shape = common_layers.shape_list(targets)
        targets = common_layers.flatten4d3d(targets)

        (decoder_input, decoder_self_attention_bias
         ) = transformer.transformer_prepare_decoder(targets,
                                                     hparams,
                                                     features=features)

        decode_kwargs = {}
        decoder_output = self.decode(
            decoder_input,
            encoder_output,
            encoder_decoder_attention_bias,
            decoder_self_attention_bias,
            hparams,
            nonpadding=transformer.features_to_nonpadding(features, "targets"),
            losses=losses,
            **decode_kwargs)

        expected_attentions = features.get("expected_attentions")
        if expected_attentions is not None:
            attention_loss = common_attention.encoder_decoder_attention_loss(
                expected_attentions, self.attention_weights,
                hparams.expected_attention_loss_type,
                hparams.expected_attention_loss_multiplier)
            return decoder_output, {"attention_loss": attention_loss}

        ret = tf.reshape(decoder_output, targets_shape)
        if losses:
            return ret, losses
        else:
            return ret
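The knowledge branch marks all triples as non-padding (a tensor of zeros) before building an attention bias. A minimal sketch of what attention_bias_ignore_padding conventionally computes in tensor2tensor (a large negative bias wherever padding is 1.0, broadcastable over heads and query positions):

import tensorflow as tf

def attention_bias_ignore_padding_sketch(memory_padding):
    # memory_padding: [batch, length], 1.0 at padded positions.
    ret = memory_padding * -1e9
    return tf.expand_dims(tf.expand_dims(ret, axis=1), axis=1)  # [b, 1, 1, l]

padding = tf.constant([[0.0, 0.0, 1.0]])               # last position padded
bias = attention_bias_ignore_padding_sketch(padding)   # shape [1, 1, 1, 3]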
    def body(self, features):
        """CopyTransformer main model_fn.

        Args:
          features: Map of features to the model. Should contain the following:
              "inputs": Transformer inputs [batch_size, input_length, hidden_dim]
              "targets": Target decoder outputs.
                  [batch_size, decoder_length, hidden_dim]
              "targets_*": Additional decoder outputs to generate, for copying
                  and pointing; [batch_size, decoder_length]
              "target_space_id": A scalar int from data_generators.problem.SpaceID.

        Returns:
          Final decoder representation. [batch_size, decoder_length, hidden_dim]
        """
        hparams = self._hparams

        losses = []

        inputs = features["inputs"]

        target_space = features["target_space_id"]
        encoder_output, encoder_decoder_attention_bias = self.encode(
            inputs, target_space, hparams, features=features, losses=losses)

        if "targets_actions" in features:
            targets = features["targets_actions"]
        else:
            tf.logging.warn(
                "CopyTransformer must be used with a SemanticParsing problem "
                "with a ShiftReduceGrammar; bad things will happen otherwise")
            targets = features["targets"]

        targets_shape = common_layers.shape_list(targets)

        targets = common_layers.flatten4d3d(targets)

        decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
            targets, hparams, features=features)

        decoder_output = self.decode(decoder_input,
                                     encoder_output,
                                     encoder_decoder_attention_bias,
                                     decoder_self_attention_bias,
                                     hparams,
                                     nonpadding=features_to_nonpadding(
                                         features, "targets"),
                                     losses=losses)

        expected_attentions = features.get("expected_attentions")
        if expected_attentions is not None:
            attention_loss = common_attention.encoder_decoder_attention_loss(
                expected_attentions, self.attention_weights,
                hparams.expected_attention_loss_type,
                hparams.expected_attention_loss_multiplier)
            return decoder_output, {"attention_loss": attention_loss}

        decoder_output = tf.reshape(decoder_output, targets_shape)

        body_output = dict()
        target_modality = self._problem_hparams.target_modality \
            if self._problem_hparams else {"targets": None}

        assert hparams.pointer_layer in ("attentive", "decaying_attentive")

        for key, modality in target_modality.items():
            if isinstance(modality, CopyModality):
                with tf.variable_scope("copy_layer/" + key):
                    if hparams.pointer_layer == "decaying_attentive":
                        output_layer = DecayingAttentivePointerLayer(
                            encoder_output)
                    else:
                        output_layer = AttentivePointerLayer(encoder_output)
                    scores = output_layer(decoder_output)
                    scores += encoder_decoder_attention_bias
                    body_output[key] = scores
            else:
                body_output[key] = decoder_output

        if losses:
            return body_output, {"extra_loss": tf.add_n(losses)}
        else:
            return body_output
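AttentivePointerLayer and DecayingAttentivePointerLayer are custom layers, not part of core tensor2tensor. Under the usual pointer-network formulation, copy scores are dot products between decoder states and encoder states; a minimal sketch of that idea, with hypothetical shapes:

import tensorflow as tf

def attentive_pointer_scores(decoder_output, encoder_output):
    # decoder_output: [batch, dec_len, hidden]
    # encoder_output: [batch, enc_len, hidden]
    # Returns [batch, dec_len, enc_len] copy scores.
    return tf.matmul(decoder_output, encoder_output, transpose_b=True)

dec = tf.random.normal([2, 4, 8])
enc = tf.random.normal([2, 6, 8])
scores = attentive_pointer_scores(dec, enc)  # [2, 4, 6]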