def attention_lm_moe_prepare_decoder(targets, hparams):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a Tensor, containing large negative values
    to implement masked attention and possibly biases for diagonal alignments
    pad_remover (expert_utils.PadRemover): a utility object to remove padding
  """
  targets_pad_mask = common_attention.embedding_to_padding(targets)
  with tf.name_scope("pad_remover"):
    # Because of the shift_right, the <eos> token will be considered as
    # padding. In practice this doesn't really matter: due to the triangular
    # mask, this token should never be attended to.
    pad_remover = expert_utils.PadRemover(targets_pad_mask)

  if hparams.prepend_mode == "prepend_inputs_full_attention":
    decoder_self_attention_bias = (
        common_attention.attention_bias_prepend_inputs_full_attention(
            targets_pad_mask))
  else:
    decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
  decoder_input = common_layers.shift_right_3d(targets)
  if hparams.pos == "timing":
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias, pad_remover)
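
A minimal usage sketch of the function above. The stub hparams class and the dummy shapes are illustrative assumptions, not the real Tensor2Tensor hparams object; only the two attributes the function reads are provided, and TF 1.x graph mode plus this module's imports are assumed.

import tensorflow as tf


class _LMStubHParams(object):  # hypothetical stand-in for hparams (assumption)
  prepend_mode = "none"
  pos = "timing"


targets = tf.random_normal([8, 16, 512])  # [batch, length, hidden_size]
decoder_input, decoder_bias, pad_remover = attention_lm_moe_prepare_decoder(
    targets, _LMStubHParams())
# decoder_input: [8, 16, 512]; decoder_bias: [1, 1, 16, 16] (lower triangle).
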
def prepare_image_question_encoder(image_feat, question, hparams):
  """Prepare encoder.

  Args:
    image_feat: a Tensor.
    question: a Tensor.
    hparams: run hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """

  encoder_input = tf.concat([image_feat, question], axis=1)
  encoder_padding = common_attention.embedding_to_padding(encoder_input)
  ignore_padding = common_attention.attention_bias_ignore_padding(
      encoder_padding)
  encoder_self_attention_bias = ignore_padding
  encoder_decoder_attention_bias = ignore_padding
  # Usual case - not a packed dataset.
  if hparams.pos == "timing":
    question = common_attention.add_timing_signal_1d(question)
  elif hparams.pos == "emb":
    question = common_attention.add_positional_embedding(
        question, hparams.max_length, "inputs_positional_embedding",
        None)
  encoder_input = tf.concat([image_feat, question], axis=1)

  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
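
A hedged usage sketch of prepare_image_question_encoder: the image features and question embeddings only need to share the hidden dimension. The stub hparams and the shapes are illustrative assumptions.

import tensorflow as tf


class _VqaStubHParams(object):  # hypothetical stand-in for hparams (assumption)
  pos = "timing"
  max_length = 256  # only read when pos == "emb"


image_feat = tf.random_normal([4, 36, 512])  # [batch, num_image_regions, hidden]
question = tf.random_normal([4, 20, 512])    # [batch, question_length, hidden]
encoder_input, enc_self_bias, enc_dec_bias = prepare_image_question_encoder(
    image_feat, question, _VqaStubHParams())
# encoder_input: [4, 56, 512]; both bias tensors: [4, 1, 1, 56].
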
Example #3
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  if features and "inputs_segmentation" in features:
    # Packed dataset.  Keep the examples from seeing each other.
    inputs_segmentation = features["inputs_segmentation"]
    inputs_position = features["inputs_position"]
    targets_segmentation = features["targets_segmentation"]
    encoder_self_attention_bias = common_attention.attention_bias_same_segment(
        inputs_segmentation, inputs_segmentation)
    encoder_decoder_attention_bias = (
        common_attention.attention_bias_same_segment(
            targets_segmentation, inputs_segmentation))
  else:
    # Usual case - not a packed dataset.
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    inputs_position = None
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(inputs)[1])
  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      target_space, 32, ishape_static[-1], name="target_space_embedding")
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input += emb_target_space
  if hparams.pos == "timing":
    if inputs_position is not None:
      encoder_input = common_attention.add_timing_signal_1d_given_position(
          encoder_input, inputs_position)
    else:
      encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
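
A usage sketch, assuming the stock transformer_base() hparams set (which defines the pos, proximity_bias and hidden_size fields read above); target_space is a scalar SpaceID that is embedded through a 32-entry table.

import tensorflow as tf
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()
inputs = tf.random_normal([2, 10, hparams.hidden_size])  # [batch, length, hidden]
target_space = tf.constant(1)  # scalar SpaceID
enc_input, enc_self_bias, enc_dec_bias = transformer_prepare_encoder(
    inputs, target_space, hparams)
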
Example #4
def question_encoder(question, hparams, name="encoder"):
  """Question encoder, run LSTM encoder and get the last output as encoding."""
  with tf.variable_scope(name, "encoder", values=[question]):
    question = common_layers.flatten4d3d(question)
    padding = common_attention.embedding_to_padding(question)
    length = common_attention.padding_to_length(padding)

    max_question_length = hparams.max_question_length
    question = question[:, :max_question_length, :]
    actual_question_length = common_layers.shape_list(question)[1]
    length = tf.minimum(length, max_question_length)
    padding = [[0, 0],
               [0, max_question_length-actual_question_length],
               [0, 0]]
    question = tf.pad(question, padding)
    question_shape = question.get_shape().as_list()
    question_shape[1] = max_question_length
    question.set_shape(question_shape)

    # apply tanh dropout on question embedding
    question = tf.tanh(question)
    question = tf.nn.dropout(question, keep_prob=1.-hparams.dropout)

    question = [question[:, i, :] for i in range(max_question_length)]

    # rnn_layers = [_get_rnn_cell(hparams)
    #               for _ in range(hparams.num_rnn_layers)]
    # rnn_multi_cell = tf.contrib.rnn.MultiRNNCell(rnn_layers)
    rnn_cell = _get_rnn_cell(hparams)
    # outputs, _ = tf.nn.dynamic_rnn(
    #     rnn_cell, question, length, dtype=tf.float32)
    _, state = tf.nn.static_rnn(rnn_cell, question, sequence_length=length,
                                dtype=tf.float32)
    # outputs = [tf.expand_dims(output, axis=1) for output in outputs]
    # outputs = tf.concat(outputs, axis=1)

    # utils.collect_named_outputs("vqa_attention_debug", "question_output",
    #                             outputs)
    # utils.collect_named_outputs("vqa_attention_debug", "question_state",
    #                             state.h)

    # batch_size = common_layers.shape_list(outputs)[0]
    # row_indices = tf.range(batch_size)
    # # length - 1 as index
    # indices = tf.transpose([row_indices, tf.maximum(length-1, 0)])
    # last_output = tf.gather_nd(outputs, indices)

    # utils.collect_named_outputs("vqa_attention_debug",
    #                             "question_final_output", last_output)

  return state.h
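
The _get_rnn_cell helper used above is not shown in this snippet; the following is a hedged sketch of one plausible definition, assuming hparams provides hidden_size and dropout. An LSTM-style cell is required so that the final state exposes the .h attribute returned above.

import tensorflow as tf


def _get_rnn_cell(hparams):
  # LSTM cell with output dropout; its final state is an LSTMStateTuple,
  # which is why question_encoder can return state.h.
  cell = tf.nn.rnn_cell.BasicLSTMCell(hparams.hidden_size)
  return tf.nn.rnn_cell.DropoutWrapper(
      cell, output_keep_prob=1.0 - hparams.dropout)
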
Example #5
def transformer_prepare_decoder(targets, hparams, features=None):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a bias tensor for use in decoder self-attention
  """
  if hparams.prepend_mode == "prepend_inputs_full_attention":
    decoder_self_attention_bias = (
        common_attention.attention_bias_prepend_inputs_full_attention(
            common_attention.embedding_to_padding(targets)))
  else:
    decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(
            common_layers.shape_list(targets)[1]))

  if features and "targets_segmentation" in features:
    # "Packed" dataset - keep the examples from seeing each other.
    targets_segmentation = features["targets_segmentation"]
    targets_position = features["targets_position"]
    decoder_self_attention_bias += common_attention.attention_bias_same_segment(
        targets_segmentation, targets_segmentation)
  else:
    targets_position = None
  if hparams.proximity_bias:
    decoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(targets)[1])
  decoder_input = common_layers.shift_right_3d(targets)
  if hparams.pos == "timing":
    if targets_position is not None:
      decoder_input = common_attention.add_timing_signal_1d_given_position(
          decoder_input, targets_position)
    else:
      decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias)
def prepare_question_encoder(inputs, hparams):
  """Prepare question encoder.

  Args:
    inputs: a Tensor.
    hparams: run hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
  """
  encoder_input = inputs
  # Usual case - not a packed dataset.
  encoder_padding = common_attention.embedding_to_padding(encoder_input)
  ignore_padding = common_attention.attention_bias_ignore_padding(
      encoder_padding)
  encoder_self_attention_bias = ignore_padding
  if hparams.pos == "timing":
    encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  elif hparams.pos == "emb":
    encoder_input = common_attention.add_positional_embedding(
        encoder_input, hparams.max_length, "inputs_positional_embedding",
        None)
  return (encoder_input, encoder_self_attention_bias)
Example #7
def attention_lm_prepare_decoder(targets, hparams):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a Tensor, containing large negative values
    to implement masked attention and possibly biases for diagonal alignments
  """
  if hparams.prepend_mode == "prepend_inputs_full_attention":
    decoder_self_attention_bias = (
        common_attention.attention_bias_prepended(
            common_attention.embedding_to_padding(targets)))
  else:
    decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(
            common_layers.shape_list(targets)[1]))
  decoder_input = common_layers.shift_right_3d(targets)
  if hparams.pos == "timing":
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias)
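
For reference, the "large negative values" mentioned in the docstrings above look like this (a sketch; TF 1.x session assumed). Zeros let a position attend; the large negatives mask future positions out of the softmax.

import tensorflow as tf
from tensor2tensor.layers import common_attention

bias = common_attention.attention_bias_lower_triangle(3)  # shape [1, 1, 3, 3]
with tf.Session() as sess:
  print(sess.run(bias[0, 0]))
  # approximately:
  # [[ 0.e+00 -1.e+09 -1.e+09]
  #  [ 0.e+00  0.e+00 -1.e+09]
  #  [ 0.e+00  0.e+00  0.e+00]]
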
  def testPadRemover(self):
    """Check that the padding remover is working correctly."""
    x_1 = tf.constant([
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
        [0, 0, 0],  # pad
        [0, 0, 0],  # pad
        [0, 0, 0],  # pad
        [10, 11, 12],
        [13, 14, 15],
        [0, 0, 0],  # pad
    ], dtype=tf.float32)
    # Get padding mask
    x_pad_mask = common_attention.embedding_to_padding(x_1)
    x_2 = tf.constant([
        [1],
        [2],
        [3],
        [4],  # pad
        [5],  # pad
        [6],  # pad
        [7],
        [8],
        [9],  # pad
    ], dtype=tf.float32)
    x_3 = tf.constant([
        1,
        2,
        3,
        4,  # pad
        5,  # pad
        6,  # pad
        7,
        8,
        9,  # pad
    ], dtype=tf.float32)

    pad_remover = expert_utils.PadRemover(x_pad_mask)

    y_1 = pad_remover.remove(x_1)
    y_2 = pad_remover.remove(x_2)
    y_3 = pad_remover.remove(x_3)

    z_1 = pad_remover.restore(y_1 * 2)
    z_2 = pad_remover.restore(y_2 * 2)
    z_3 = pad_remover.restore(y_3 * 2)

    with self.test_session() as sess:
      # Padding should have been removed
      self._verify_value(sess, y_1, [
          [1., 2., 3.],
          [4., 5., 6.],
          [7., 8., 9.],
          [10., 11., 12.],
          [13., 14., 15.],
      ])
      self._verify_value(sess, y_2, [
          [1.],
          [2.],
          [3.],
          [7.],
          [8.],
      ])
      self._verify_value(sess, y_3, [
          1.,
          2.,
          3.,
          7.,
          8.,
      ])

      # Padding should have been restored
      self._verify_value(sess, z_1, [
          [2., 4., 6.],
          [8., 10., 12.],
          [14., 16, 18.],
          [0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.],
          [20., 22., 24.],
          [26., 28., 30.],
          [0., 0., 0.],
      ])
      self._verify_value(sess, z_2, [
          [2.],
          [4.],
          [6.],
          [0.],  # pad
          [0.],  # pad
          [0.],  # pad
          [14.],
          [16.],
          [0.],  # pad
      ])
      self._verify_value(sess, z_3, [
          2.,
          4.,
          6.,
          0.,  # pad
          0.,  # pad
          0.,  # pad
          14.,
          16.,
          0.,  # pad
      ])
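
The pattern the test above exercises is typically wrapped around a position-wise layer: remove the padded positions, run the expensive computation on real tokens only, then restore zeros where the padding was. A sketch (the dense layer is just a placeholder):

import tensorflow as tf
from tensor2tensor.layers import common_attention
from tensor2tensor.utils import expert_utils

# Last 3 of 9 timesteps are all-zero embeddings, i.e. padding.
x = tf.concat([tf.ones([4, 6, 64]), tf.zeros([4, 3, 64])], axis=1)
pad_mask = common_attention.embedding_to_padding(x)  # [4, 9], 1.0 at padding
pad_remover = expert_utils.PadRemover(pad_mask)

flat = tf.reshape(x, [-1, 64])     # [batch * length, hidden] = [36, 64]
flat = pad_remover.remove(flat)    # [24, 64]: padded rows dropped
flat = tf.layers.dense(flat, 64)   # any position-wise computation
flat = pad_remover.restore(flat)   # [36, 64]: zeros re-inserted
y = tf.reshape(flat, tf.shape(x))
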
  def bottom(self, x):
    """Use batchnorm instead of CMVN and shorten the stft with strided convs.

    Args:
      x: float32 tensor with shape [batch_size, len, 1, freqs * channels]

    Returns:
      float32 tensor with shape [batch_size, shorter_len, 1, hidden_size]
    """
    inputs = x
    p = self._model_hparams

    num_mel_bins = p.audio_num_mel_bins
    num_channels = 3 if p.audio_add_delta_deltas else 1

    with tf.variable_scope(self.name):
      if p.audio_preproc_in_bottom:
        # Compute filterbanks
        with tf.variable_scope("fbanks"):
          waveforms = tf.squeeze(inputs, [2, 3])
          mel_fbanks = compute_mel_filterbank_features(
              waveforms,
              sample_rate=p.audio_sample_rate,
              dither=p.audio_dither,
              preemphasis=p.audio_preemphasis,
              frame_length=p.audio_frame_length,
              frame_step=p.audio_frame_step,
              lower_edge_hertz=p.audio_lower_edge_hertz,
              upper_edge_hertz=p.audio_upper_edge_hertz,
              num_mel_bins=p.audio_num_mel_bins,
              apply_mask=True)
          if p.audio_add_delta_deltas:
            mel_fbanks = add_delta_deltas(mel_fbanks)
          x = tf.reshape(mel_fbanks,
                         common_layers.shape_list(mel_fbanks)[:2] +
                         [num_mel_bins, num_channels])

          nonpadding_mask = 1. - common_attention.embedding_to_padding(x)
          num_of_nonpadding_elements = tf.reduce_sum(
              nonpadding_mask) * num_mel_bins * num_channels

          # This replaces CMVN estimation on data
          var_epsilon = 1e-09
          mean = tf.reduce_sum(
              x, axis=[1], keepdims=True) / num_of_nonpadding_elements
          variance = (num_of_nonpadding_elements * mean**2. -
                      2. * mean * tf.reduce_sum(x, axis=[1], keepdims=True) +
                      tf.reduce_sum(x**2, axis=[1], keepdims=True)
                     ) / num_of_nonpadding_elements
          x = (x - mean) * tf.rsqrt(variance + var_epsilon) * tf.expand_dims(
              nonpadding_mask, -1)
      else:
        x = inputs

      # The convention is that the models are flattened along the spatial
      # dimensions, thus the speech preprocessor treats frequencies and
      # channels as image colors (last axis).
      x.set_shape([None, None, num_mel_bins, num_channels])

      # TODO(chorowski): how to specify bottom's hparams and avoid hardcoding?
      x = tf.pad(x, [[0, 0], [0, 8], [0, 0], [0, 0]])
      for _ in range(2):
        x = tf.layers.conv2d(
            x, 128, (3, 3), (2, 2), use_bias=False)
        x = common_layers.layer_norm(x)
        x = tf.nn.relu(x)

      xshape = common_layers.shape_list(x)
      # apply a conv that will remove all frequencies and at the same time
      # project the output into desired hidden_size
      x = tf.pad(x, [[0, 0], [0, 2], [0, 0], [0, 0]])
      x = tf.layers.conv2d(x, p.hidden_size, (3, xshape[2]), use_bias=False)

      assert common_layers.shape_list(x)[2] == 1
      x = common_layers.layer_norm(x)
      x = tf.nn.relu(x)
    return x
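
Rough bookkeeping (a sketch) of how much the two stride-2 VALID convolutions above shorten the time axis: tf.layers.conv2d with kernel 3, stride 2 and no padding maps a length L to floor((L - 3) / 2) + 1, applied after the 8 extra frames appended by tf.pad.

def shortened_length(num_frames, appended=8, kernel=3, stride=2, num_convs=2):
  length = num_frames + appended
  for _ in range(num_convs):
    length = (length - kernel) // stride + 1
  return length

print(shortened_length(100))  # 100 filterbank frames -> 26 encoder steps
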
Example #10
  def _fast_decode(self,
                   features,
                   decode_length,
                   beam_size=1,
                   top_beams=1,
                   alpha=1.0):
    """Fast decoding.

    Implements both greedy and beam search decoding; uses beam search iff
    beam_size > 1, otherwise beam-search-related arguments are ignored.

    Args:
      features: a map of string to model features.
      decode_length: an integer.  How many additional timesteps to decode.
      beam_size: number of beams.
      top_beams: an integer. How many of the beams to return.
      alpha: Float that controls the length penalty. The larger the alpha, the
        stronger the preference for longer translations.

    Returns:
       samples: an integer `Tensor`. Top samples from the beam search

    Raises:
      NotImplementedError: If there are multiple data shards.
    """
    if self._num_datashards != 1:
        raise NotImplementedError("Fast decoding only supports a single shard.")
    dp = self._data_parallelism
    hparams = self._hparams

    inputs = features["inputs"]
    firstP = features["firstP"]
    imageP = features["imageP"]
    
    #JI: set image shapes
    imageP.set_shape([None, 27250])
    imageP = tf.reshape(imageP, [-1, img_dim, 50])
    
    batch_size = tf.shape(inputs)[0]
    target_modality = self._problem_hparams.target_modality
    if t2t_model.is_class_modality(target_modality):
        decode_length = 1
    else:
        decode_length = tf.shape(inputs)[1] + decode_length

    # TODO(llion): Clean up this reshaping logic.
    # It is unclear why this extra reshaping is necessary.
    inputs = tf.expand_dims(inputs, axis=1)
    if len(inputs.shape) < 5:
        inputs = tf.expand_dims(inputs, axis=4)
    s = tf.shape(inputs)
    inputs = tf.reshape(inputs, [s[0] * s[1], s[2], s[3], s[4]])

    firstP = tf.expand_dims(firstP, axis=1)
    if len(firstP.shape) < 5:
        firstP = tf.expand_dims(firstP, axis=4)
    z = tf.shape(firstP)
    firstP = tf.reshape(firstP, [z[0] * z[1], z[2], z[3], z[4]])

    # _shard_features called to ensure that the variable names match
    inputs = self._shard_features({"inputs": inputs})["inputs"]

    # deal with the encoder
    input_modality = self._problem_hparams.input_modality["inputs"]
    with tf.variable_scope(input_modality.name):
        inputs = input_modality.bottom_sharded(inputs, dp)
    with tf.variable_scope("body"):
        #JI: pass images to encoder if needed
        encoder_output, encoder_decoder_attention_bias = dp(
            self.encode, inputs, features["target_space_id"], hparams, imageP=imageP)
    encoder_output = encoder_output[0]
    encoder_decoder_attention_bias = encoder_decoder_attention_bias[0]

    # deal with the first pass decoder
    def preprocess_firstP(firstP):
        firstP = self._shard_features({"firstP": firstP})["firstP"]
        firstP_modality = self._problem_hparams.input_modality["firstP"]
        with tf.variable_scope(firstP_modality.name):
            firstP = firstP_modality.targets_bottom_sharded(firstP, dp)[0]
        firstP = common_layers.flatten4d3d(firstP)
        if hparams.pos == "timing":
            firstP = common_attention.add_timing_signal_1d(firstP)
        return firstP

    firstP = preprocess_firstP(firstP)
    firstPdecoder_padding = common_attention.embedding_to_padding(firstP)
    firstP_delib_attention_bias = common_attention.attention_bias_ignore_padding(firstPdecoder_padding)
    firstP_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(tf.shape(firstP)[1]))
    if hparams.proximity_bias:
        firstP_self_attention_bias += common_attention.attention_bias_proximal(tf.shape(firstP)[1])

    if hparams.pos == "timing":
      timing_signal = common_attention.get_timing_signal_1d(decode_length + 1, hparams.hidden_size)
    
    #JI: get visual attention bias
    img_encoder_padding = common_attention.embedding_to_padding(imageP)
    imageP_self_attention_bias = common_attention.attention_bias_ignore_padding(img_encoder_padding)
    

    def preprocess_targets(targets, i):
        """Performs preprocessing steps on the targets to prepare for the decoder.

        This includes:
          - Embedding the ids.
          - Flattening to 3D tensor.
          - Optionally adding timing signals.

        Args:
          targets: inputs ids to the decoder. [batch_size, 1]
          i: scalar, Step number of the decoding loop.

        Returns:
          Processed targets [batch_size, 1, hidden_dim]
        """
        # _shard_features called to ensure that the variable names match
        targets = self._shard_features({"targets": targets})["targets"]
        with tf.variable_scope(target_modality.name, reuse=True):
            targets = target_modality.targets_bottom_sharded(targets, dp)[0]
        targets = common_layers.flatten4d3d(targets)

        # TODO(llion): Explain! Is this even needed?
        targets = tf.cond(
            tf.equal(i, 0), lambda: tf.zeros_like(targets), lambda: targets)

        if hparams.pos == "timing":
            targets += timing_signal[:, i:i + 1]
        return targets

    # this is actually for the delib-decoder, i.e., the 2nd-pass decoder
    decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(decode_length))
    if hparams.proximity_bias:
        decoder_self_attention_bias += common_attention.attention_bias_proximal(decode_length)

    key_channels = hparams.attention_key_channels or hparams.hidden_size
    value_channels = hparams.attention_value_channels or hparams.hidden_size
    num_layers = hparams.num_decoder_layers or hparams.num_hidden_layers

    cache = {
        "layer_%d" % layer: {
            "k": tf.zeros([batch_size, 0, key_channels]),
            "v": tf.zeros([batch_size, 0, value_channels]),
        }
        for layer in range(num_layers)
        }

    # Set 2nd dim to None since it's not invariant in the tf.while_loop
    # Note: Tensor.set_shape() does not work here since it merges shape info.
    # TODO(llion): Find a more robust solution.
    # pylint: disable=protected-access
    for layer in cache:
        cache[layer]["k"]._shape = tf.TensorShape([None, None, key_channels])
        cache[layer]["v"]._shape = tf.TensorShape([None, None, value_channels])
    # pylint: enable=protected-access
    cache["encoder_output"] = encoder_output
    cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

    with tf.variable_scope("body"):
        firstP_hidden = dp(transformer_decoder, firstP, encoder_output, firstP_self_attention_bias,
                            encoder_decoder_attention_bias, hparams)
    firstP_input = tf.concat(values=[firstP, firstP_hidden[0]], axis=-1)
    cache["firstP_input"] = firstP_input
    cache["firstP_self_attention_bias"] = firstP_delib_attention_bias
    #JI: add image info to cache
    cache["imageP"] = imageP
    cache["imageP_self_attention_bias"] = imageP_self_attention_bias
   
    def symbols_to_logits_fn(ids, i, cache):
        """Go from ids to logits for next symbol."""
        ids = ids[:, -1:]
        targets = tf.expand_dims(tf.expand_dims(ids, axis=2), axis=3)
        targets = preprocess_targets(targets, i)

        bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]

        with tf.variable_scope("body"):
            #JI: pass image info to decoder
            body_outputs = dp(transformer_delibdecoder,
                targets, cache["encoder_output"], cache["firstP_input"], cache["imageP"],
                bias, cache["encoder_decoder_attention_bias"],
                cache["firstP_self_attention_bias"], cache["imageP_self_attention_bias"], hparams, cache)

        with tf.variable_scope(target_modality.name):
            logits = target_modality.top_sharded(body_outputs, None, dp)[0]

        return tf.squeeze(logits, axis=[1, 2, 3]), cache



    if beam_size > 1:  # Beam Search
        target_modality = (
            self._hparams.problems[self._problem_idx].target_modality)
        vocab_size = target_modality.top_dimensionality
        initial_ids = tf.zeros([batch_size], dtype=tf.int32)
        decoded_ids, scores = beam_search.beam_search(
            symbols_to_logits_fn, initial_ids, beam_size, decode_length,
            vocab_size, alpha, states=cache, stop_early=(top_beams == 1))
        
        if top_beams == 1:
            decoded_ids = decoded_ids[:, 0, 1:]
        else:
            decoded_ids = decoded_ids[:, :top_beams, 1:]
    else:  # Greedy

        def inner_loop(i, next_id, decoded_ids, cache):
            logits, cache = symbols_to_logits_fn(next_id, i, cache)
            temperature = (0.0 if hparams.sampling_method == "argmax"
                           else hparams.sampling_temp)
            next_id = tf.expand_dims(
                common_layers.sample_with_temperature(logits, temperature), axis=1)
            decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
            return i + 1, next_id, decoded_ids, cache

        decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int64)
        scores = None
        next_id = tf.zeros([batch_size, 1], dtype=tf.int64)
        _, _, decoded_ids, _ = tf.while_loop(
            # TODO(llion): Early stopping.
            lambda i, *_: tf.less(i, decode_length),
            inner_loop,
            [tf.constant(0), next_id, decoded_ids, cache],
            shape_invariants=[
                tf.TensorShape([]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, None]),
                nest.map_structure(lambda t: tf.TensorShape(t.shape), cache),
            ])

    return decoded_ids, scores
def transformer_prepare_encoder(inputs,
                                target_space,
                                hparams,
                                features=None,
                                type_ids=None,
                                num_types=None,
                                reuse_target_embedding=tf.AUTO_REUSE):
    """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.
    type_ids: optional, an int64 Tensor of shape [batch, length] that allows
      for adding type embeddings, similar to positional embeddings.
    num_types: optional, an int that decides the number of types in type_ids.
    reuse_target_embedding: option to reuse variable name in the case that
      symbol modalities are reused between inputs/targets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
    ishape_static = inputs.shape.as_list()
    encoder_input = inputs
    if features and "inputs_segmentation" in features:
        # Packed dataset.  Keep the examples from seeing each other.
        inputs_segmentation = features["inputs_segmentation"]
        inputs_position = features["inputs_position"]
        targets_segmentation = features["targets_segmentation"]
        if (hasattr(hparams, "unidirectional_encoder")
                and hparams.unidirectional_encoder):
            tf.logging.info("Using unidirectional encoder")
            encoder_self_attention_bias = (
                common_attention.attention_bias_lower_triangle(
                    common_layers.shape_list(inputs)[1]))
        else:
            encoder_self_attention_bias = (
                common_attention.attention_bias_same_segment(
                    inputs_segmentation, inputs_segmentation))
        encoder_decoder_attention_bias = (
            common_attention.attention_bias_same_segment(
                targets_segmentation, inputs_segmentation))
    else:
        encoder_padding = common_attention.embedding_to_padding(encoder_input)
        ignore_padding = common_attention.attention_bias_ignore_padding(
            encoder_padding)
        if (hasattr(hparams, "unidirectional_encoder")
                and hparams.unidirectional_encoder):
            tf.logging.info("Using unidirectional encoder")
            encoder_self_attention_bias = (
                common_attention.attention_bias_lower_triangle(
                    common_layers.shape_list(inputs)[1]))
        else:
            # Usual case - not a packed dataset.
            encoder_self_attention_bias = ignore_padding
        encoder_decoder_attention_bias = ignore_padding
        inputs_position = None
    if hparams.proximity_bias:
        encoder_self_attention_bias += common_attention.attention_bias_proximal(
            common_layers.shape_list(inputs)[1])
    if target_space is not None and hparams.get("use_target_space_embedding",
                                                True):
        # Append target_space_id embedding to inputs.
        emb_target_space = common_layers.embedding(
            target_space,
            32,
            ishape_static[-1],
            name="target_space_embedding",
            dtype=hparams.get("activation_dtype", "float32"),
            reuse=reuse_target_embedding)
        emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
        encoder_input += emb_target_space
    if hparams.pos == "timing":
        if inputs_position is not None:
            encoder_input = common_attention.add_timing_signal_1d_given_position(
                encoder_input, inputs_position)
        else:
            encoder_input = common_attention.add_timing_signal_1d(
                encoder_input)
    elif hparams.pos == "timing_from_features":
        encoder_input = common_attention.add_timing_signals_from_features(
            encoder_input, features, hparams.position_features)
    elif hparams.pos == "emb":
        encoder_input = common_attention.add_positional_embedding(
            encoder_input, hparams.max_length, "inputs_positional_embedding",
            inputs_position)

    # Add type embeddings
    if type_ids is not None:
        if not num_types:
            raise ValueError("Need to set num_types as well.")
        encoder_input = common_attention.add_positional_embedding(
            encoder_input, num_types, "inputs_type_embedding", type_ids)

    encoder_self_attention_bias = common_layers.cast_like(
        encoder_self_attention_bias, encoder_input)
    encoder_decoder_attention_bias = common_layers.cast_like(
        encoder_decoder_attention_bias, encoder_input)
    return (encoder_input, encoder_self_attention_bias,
            encoder_decoder_attention_bias)
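
A hedged usage sketch of the type-embedding variant above: type_ids tags every position with one of num_types segment types. The 0/1 split and the transformer_base() hparams below are illustrative assumptions.

import tensorflow as tf
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()
inputs = tf.random_normal([2, 8, hparams.hidden_size])
type_ids = tf.constant([[0, 0, 0, 1, 1, 1, 1, 1]] * 2, dtype=tf.int64)
enc_input, enc_self_bias, enc_dec_bias = transformer_prepare_encoder(
    inputs, target_space=tf.constant(1), hparams=hparams,
    type_ids=type_ids, num_types=2)
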
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  if features and "inputs_segmentation" in features:
    # Packed dataset.  Keep the examples from seeing each other.
    inputs_segmentation = features["inputs_segmentation"]
    inputs_position = features["inputs_position"]
    targets_segmentation = features["targets_segmentation"]
    if (hasattr(hparams, "unidirectional_encoder") and
        hparams.unidirectional_encoder):
      tf.logging.info("Using unidirectional encoder")
      encoder_self_attention_bias = (
          common_attention.attention_bias_lower_triangle(
              common_layers.shape_list(inputs)[1]))
    else:
      encoder_self_attention_bias = (
          common_attention.attention_bias_same_segment(
              inputs_segmentation, inputs_segmentation))
    encoder_decoder_attention_bias = (
        common_attention.attention_bias_same_segment(targets_segmentation,
                                                     inputs_segmentation))
  else:
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    if (hasattr(hparams, "unidirectional_encoder") and
        hparams.unidirectional_encoder):
      tf.logging.info("Using unidirectional encoder")
      encoder_self_attention_bias = (
          common_attention.attention_bias_lower_triangle(
              common_layers.shape_list(inputs)[1]))
    else:
      # Usual case - not a packed dataset.
      encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    inputs_position = None
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(inputs)[1])
  if hparams.get("use_target_space_embedding", True):
    # Append target_space_id embedding to inputs.
    emb_target_space = common_layers.embedding(
        target_space,
        32,
        ishape_static[-1],
        name="target_space_embedding",
        dtype=tf.bfloat16
        if hparams.activation_dtype == "bfloat16" else tf.float32)
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
    encoder_input += emb_target_space

  # apply (sub)word dropout
  # encoder_input is of shape [batch_size, max_len, hidden_size]
  input_word_dropout = hparams.get("input_word_dropout", 0.0)
  if input_word_dropout:
    mask = tf.random_uniform(
        [tf.shape(encoder_input)[0], tf.shape(encoder_input)[1], 1])
    encoder_input *= tf.to_float(tf.greater_equal(mask, input_word_dropout))

  if hparams.pos == "timing":
    if inputs_position is not None:
      encoder_input = common_attention.add_timing_signal_1d_given_position(
          encoder_input, inputs_position)
    else:
      encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  elif hparams.pos == "emb":
    encoder_input = common_attention.add_positional_embedding(
        encoder_input, hparams.max_length, "inputs_positional_embedding",
        inputs_position)
  if hparams.activation_dtype == "bfloat16":
    encoder_self_attention_bias = tf.cast(encoder_self_attention_bias,
                                          tf.bfloat16)
    encoder_decoder_attention_bias = tf.cast(encoder_decoder_attention_bias,
                                             tf.bfloat16)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
    """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
    ishape_static = inputs.shape.as_list()
    encoder_input = inputs
    if features and "inputs_segmentation" in features:
        # Packed dataset.  Keep the examples from seeing each other.
        inputs_segmentation = features["inputs_segmentation"]
        inputs_position = features["inputs_position"]
        targets_segmentation = features["targets_segmentation"]
        encoder_self_attention_bias = common_attention.attention_bias_same_segment(
            inputs_segmentation, inputs_segmentation)
        encoder_decoder_attention_bias = (
            common_attention.attention_bias_same_segment(
                targets_segmentation, inputs_segmentation))
    else:
        # Usual case - not a packed dataset.
        encoder_padding = common_attention.embedding_to_padding(encoder_input)
        ignore_padding = common_attention.attention_bias_ignore_padding(
            encoder_padding)
        encoder_self_attention_bias = ignore_padding
        encoder_decoder_attention_bias = ignore_padding
        inputs_position = None
    if hparams.proximity_bias:
        encoder_self_attention_bias += common_attention.attention_bias_proximal(
            common_layers.shape_list(inputs)[1])
    if hparams.get("use_target_space_embedding", True):
        # Append target_space_id embedding to inputs.
        emb_target_space = common_layers.embedding(
            target_space,
            32,
            ishape_static[-1],
            name="target_space_embedding",
            dtype=tf.bfloat16
            if hparams.activation_dtype == "bfloat16" else tf.float32)
        emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
        encoder_input += emb_target_space
    if hparams.pos == "timing":
        if inputs_position is not None:
            encoder_input = common_attention.add_timing_signal_1d_given_position(
                encoder_input, inputs_position)
        else:
            encoder_input = common_attention.add_timing_signal_1d(
                encoder_input)
    elif hparams.pos == "emb":
        encoder_input = common_attention.add_positional_embedding(
            encoder_input, hparams.max_length, "inputs_positional_embedding",
            inputs_position)
    if hparams.activation_dtype == "bfloat16":
        encoder_self_attention_bias = tf.cast(encoder_self_attention_bias,
                                              tf.bfloat16)
        encoder_decoder_attention_bias = tf.cast(
            encoder_decoder_attention_bias, tf.bfloat16)
    return (encoder_input, encoder_self_attention_bias,
            encoder_decoder_attention_bias)
Example #14
    def testPadRemover(self):
        """Check that the padding remover is working correctly."""
        x_1 = tf.constant(
            [
                [1, 2, 3],
                [4, 5, 6],
                [7, 8, 9],
                [0, 0, 0],  # pad
                [0, 0, 0],  # pad
                [0, 0, 0],  # pad
                [10, 11, 12],
                [13, 14, 15],
                [0, 0, 0],  # pad
            ],
            dtype=tf.float32)
        # Get padding mask
        x_pad_mask = common_attention.embedding_to_padding(x_1)
        x_2 = tf.constant(
            [
                [1],
                [2],
                [3],
                [4],  # pad
                [5],  # pad
                [6],  # pad
                [7],
                [8],
                [9],  # pad
            ],
            dtype=tf.float32)
        x_3 = tf.constant(
            [
                1,
                2,
                3,
                4,  # pad
                5,  # pad
                6,  # pad
                7,
                8,
                9,  # pad
            ],
            dtype=tf.float32)

        pad_remover = expert_utils.PadRemover(x_pad_mask)

        y_1 = pad_remover.remove(x_1)
        y_2 = pad_remover.remove(x_2)
        y_3 = pad_remover.remove(x_3)

        z_1 = pad_remover.restore(y_1 * 2)
        z_2 = pad_remover.restore(y_2 * 2)
        z_3 = pad_remover.restore(y_3 * 2)

        with self.test_session() as sess:
            # Padding should have been removed
            self._verify_value(sess, y_1, [
                [1., 2., 3.],
                [4., 5., 6.],
                [7., 8., 9.],
                [10., 11., 12.],
                [13., 14., 15.],
            ])
            self._verify_value(sess, y_2, [
                [1.],
                [2.],
                [3.],
                [7.],
                [8.],
            ])
            self._verify_value(sess, y_3, [
                1.,
                2.,
                3.,
                7.,
                8.,
            ])

            # Padding should have been restored
            self._verify_value(sess, z_1, [
                [2., 4., 6.],
                [8., 10., 12.],
                [14., 16, 18.],
                [0., 0., 0.],
                [0., 0., 0.],
                [0., 0., 0.],
                [20., 22., 24.],
                [26., 28., 30.],
                [0., 0., 0.],
            ])
            self._verify_value(
                sess,
                z_2,
                [
                    [2.],
                    [4.],
                    [6.],
                    [0.],  # pad
                    [0.],  # pad
                    [0.],  # pad
                    [14.],
                    [16.],
                    [0.],  # pad
                ])
            self._verify_value(
                sess,
                z_3,
                [
                    2.,
                    4.,
                    6.,
                    0.,  # pad
                    0.,  # pad
                    0.,  # pad
                    14.,
                    16.,
                    0.,  # pad
                ])
Example #15
    def bottom(self, x):
        """Use batchnorm instead of CMVN and shorten the stft with strided convs.

    Args:
      x: float32 tensor with shape [batch_size, len, 1, freqs * channels]

    Returns:
      float32 tensor with shape [batch_size, shorter_len, 1, hidden_size]
    """
        inputs = x
        p = self._model_hparams

        num_mel_bins = p.audio_num_mel_bins
        num_channels = 3 if p.audio_add_delta_deltas else 1

        with tf.variable_scope(self.name):
            if p.audio_preproc_in_bottom:
                # Compute filterbanks
                with tf.variable_scope("fbanks"):
                    waveforms = tf.squeeze(inputs, [2, 3])
                    mel_fbanks = common_audio.compute_mel_filterbank_features(
                        waveforms,
                        sample_rate=p.audio_sample_rate,
                        dither=p.audio_dither,
                        preemphasis=p.audio_preemphasis,
                        frame_length=p.audio_frame_length,
                        frame_step=p.audio_frame_step,
                        lower_edge_hertz=p.audio_lower_edge_hertz,
                        upper_edge_hertz=p.audio_upper_edge_hertz,
                        num_mel_bins=p.audio_num_mel_bins,
                        apply_mask=True)
                    if p.audio_add_delta_deltas:
                        mel_fbanks = common_audio.add_delta_deltas(mel_fbanks)
                    x = tf.reshape(
                        mel_fbanks,
                        common_layers.shape_list(mel_fbanks)[:2] +
                        [num_mel_bins, num_channels])

                    nonpadding_mask = 1. - common_attention.embedding_to_padding(
                        x)
                    num_of_nonpadding_elements = tf.reduce_sum(
                        nonpadding_mask) * num_mel_bins * num_channels

                    # This replaces CMVN estimation on data
                    var_epsilon = 1e-09
                    mean = tf.reduce_sum(x, axis=[
                        1
                    ], keepdims=True) / num_of_nonpadding_elements
                    variance = (
                        num_of_nonpadding_elements * mean**2. -
                        2. * mean * tf.reduce_sum(x, axis=[1], keepdims=True) +
                        tf.reduce_sum(x**2, axis=[1], keepdims=True)
                    ) / num_of_nonpadding_elements
                    x = (x - mean) * tf.rsqrt(variance +
                                              var_epsilon) * tf.expand_dims(
                                                  nonpadding_mask, -1)
            else:
                x = inputs

            # The convention is that the models are flattened along the spatial
            # dimensions, thus the speech preprocessor treats frequencies and
            # channels as image colors (last axis).
            x.set_shape([None, None, num_mel_bins, num_channels])

            # TODO(chorowski): how to specify bottom's hparams and avoid hardcoding?
            x = tf.pad(x, [[0, 0], [0, 8], [0, 0], [0, 0]])
            for _ in range(2):
                x = tf.layers.conv2d(x, 128, (3, 3), (2, 2), use_bias=False)
                x = common_layers.layer_norm(x)
                x = tf.nn.relu(x)

            xshape = common_layers.shape_list(x)
            # apply a conv that will remove all frequencies and at the same time
            # project the output into desired hidden_size
            x = tf.pad(x, [[0, 0], [0, 2], [0, 0], [0, 0]])
            x = tf.layers.conv2d(x,
                                 p.hidden_size, (3, xshape[2]),
                                 use_bias=False)

            assert common_layers.shape_list(x)[2] == 1
            x = common_layers.layer_norm(x)
            x = tf.nn.relu(x)
        return x
    def compute_knowledge_selection_and_loss(self, features, encoder_output,
                                             fact_embedding, fact_lengths,
                                             margin, num_negative_samples):
        """Compute knowledge selection and loss.

    Args:
      features: features.
      encoder_output: <tf.float32>[batch_size, input_length, hidden_dim]
      fact_embedding: <tf.float32>[batch_size*triple_num, max_triple_length,
        emb_dim]
      fact_lengths: <tf.int32>[batch_size*triple_num]
      margin: integer value for the max margin in the TransE loss.
      num_negative_samples: number of negative examples to shuffle and sample
        for the TransE loss.

    Returns:
      knowledge_weights:
      knowledge_loss:
    """
        hparams = self._hparams
        encoder_output_shape = common_layers.shape_list(encoder_output)
        encoder_hidden_dim = encoder_output_shape[-1]
        inputs = features["inputs"]
        # <tf.float32>[batch_size, input_length, emb_dim]
        inputs = tf.squeeze(inputs, 2)
        # <tf.float32>[batch_size, input_length]
        context_padding = common_attention.embedding_to_padding(inputs)
        # <tf.float32>[batch_size]
        context_lens = tf.to_float(
            common_attention.padding_to_length(context_padding))
        # <tf.float32>[batch_size, 1]
        context_lens = tf.expand_dims(context_lens, -1)
        # Compute context vector summary.
        # <tf.float32>[batch_size, hidden_dim]
        context_vector_summary = compute_summary_embedding(
            encoder_output, context_lens, hparams)
        knowledge_encoder_output = compute_average_embedding(
            fact_embedding, fact_lengths)
        # <tf.float32>[batch_size, triple_num, emb_dim]
        knowledge_encoder_output = tf.reshape(
            knowledge_encoder_output,
            [-1, self.triple_num, encoder_hidden_dim])
        original_knowledge_encoder_output = knowledge_encoder_output
        if hparams.similarity_fuction == "dot_product":
            triple_logits = tf.squeeze(
                tf.matmul(knowledge_encoder_output,
                          tf.expand_dims(context_vector_summary, 2)), -1)
        elif hparams.similarity_fuction == "bilinear":
            # Tile the context vector summary.
            # <tf.float32>[batch_size, triple_num*hidden_dim]
            tiled_context_vector = tf.tile(context_vector_summary,
                                           [1, self.triple_num])
            # <tf.float32>[batch_size, triple_num, hidden_dim]
            context_vector = tf.reshape(
                tiled_context_vector,
                [-1, self.triple_num, encoder_hidden_dim])
            # compute outer product
            context_vector = tf.expand_dims(context_vector, -1)
            knowledge_encoder_output = tf.expand_dims(knowledge_encoder_output,
                                                      2)
            # <tf.float32>[batch_size, triple_num, hidden_dim, hidden_dim]
            outer_product = tf.matmul(context_vector, knowledge_encoder_output)
            outer_product = tf.reshape(
                outer_product,
                [-1, self.triple_num, encoder_hidden_dim * encoder_hidden_dim])
            triple_logits = tf.squeeze(
                tf.layers.dense(outer_product, 1, name="knolwedge_final_mlp"),
                -1)

        avg_triple_loss = 0.0
        triple_labels = features["triple_labels"]

        subject_mask = tf.reshape(
            features["subject_mask"],
            [-1, self.triple_num, hparams.max_triple_length])
        subject_mask = tf.reshape(subject_mask,
                                  [-1, hparams.max_triple_length])

        predicate_mask = tf.reshape(
            features["predicate_mask"],
            [-1, self.triple_num, hparams.max_triple_length])
        predicate_mask = tf.reshape(predicate_mask,
                                    [-1, hparams.max_triple_length])

        object_mask = tf.reshape(
            features["object_mask"],
            [-1, self.triple_num, hparams.max_triple_length])
        object_mask = tf.reshape(object_mask, [-1, hparams.max_triple_length])

        # mask : [bs, max_seq_len, triple_num]
        # the below operation will result in [bs*triple_num,emb_dim]
        subject_length = tf.cast(
            tf.expand_dims(tf.reduce_sum(subject_mask, -1), 1),
            tf.float32)  # [bs*tn]
        object_length = tf.cast(
            tf.expand_dims(tf.reduce_sum(object_mask, -1), 1), tf.float32)
        predicate_length = tf.cast(
            tf.expand_dims(tf.reduce_sum(predicate_mask, -1), 1), tf.float32)

        # expand dimension 2 to be able to broadcast
        subject_mask = tf.cast(tf.expand_dims(subject_mask, 2), tf.float32)
        predicate_mask = tf.cast(tf.expand_dims(predicate_mask, 2), tf.float32)
        object_mask = tf.cast(tf.expand_dims(object_mask, 2), tf.float32)

        subject_vect = tf.reduce_sum(tf.multiply(
            fact_embedding, subject_mask), 1) / (
                subject_length +
                tf.broadcast_to(tf.constant([1e-5]), tf.shape(subject_length)))
        object_vect = tf.reduce_sum(tf.multiply(
            fact_embedding, object_mask), 1) / (
                object_length +
                tf.broadcast_to(tf.constant([1e-5]), tf.shape(object_length)))
        predicate_vect = tf.reduce_sum(
            tf.multiply(fact_embedding, predicate_mask),
            1) / (predicate_length + tf.broadcast_to(
                tf.constant([1e-5]), tf.shape(predicate_length)))

        # Shuffled rows to generate adversarial samples
        shuffled_subject_vect = []
        shuffled_object_vect = []

        for _ in range(num_negative_samples):
            shuffled_subject_vect += [
                tf.gather(
                    subject_vect,
                    tf.random.shuffle(tf.range(tf.shape(subject_vect)[0])))
            ]  # [bs*tn,d]
            shuffled_object_vect += [
                tf.gather(
                    object_vect,
                    tf.random.shuffle(tf.range(tf.shape(object_vect)[0])))
            ]  # [bs*tn,d]

        # KB pretraining loss

        positive_loss = tf.reduce_mean(
            tf.squared_difference(subject_vect + predicate_vect, object_vect))
        negative_loss = 0
        for n_adv in range(num_negative_samples):
            negative_loss += tf.reduce_mean(
                tf.squared_difference(
                    shuffled_subject_vect[n_adv] + predicate_vect,
                    object_vect))
            negative_loss += tf.reduce_mean(
                tf.squared_difference(subject_vect + predicate_vect,
                                      shuffled_object_vect[n_adv]))

        # TransE Loss

        negative_loss = negative_loss / (2 * num_negative_samples)

        transe_loss = tf.clip_by_value(margin + positive_loss - negative_loss,
                                       clip_value_min=0,
                                       clip_value_max=100)
        if hparams.mode != tf.estimator.ModeKeys.PREDICT:
            triple_losses = tf.nn.weighted_cross_entropy_with_logits(
                labels=triple_labels,
                logits=triple_logits,
                pos_weight=hparams.pos_weight)
            avg_triple_loss = tf.reduce_mean(triple_losses)
            tf.summary.scalar("triple_loss", avg_triple_loss)

        return triple_logits, avg_triple_loss, original_knowledge_encoder_output, transe_loss
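
A toy numpy sketch of the TransE-style objective computed above: for a true triple, the subject translated by the predicate lands near the object, a shuffled (negative) subject does not, and the clipped margin loss rewards exactly that gap. The numbers are made up for illustration.

import numpy as np

subject = np.array([1., 0.])
predicate = np.array([0., 1.])
obj = np.array([1., 1.])
shuffled_subject = np.array([5., 5.])  # a shuffled negative sample

positive_loss = np.mean((subject + predicate - obj) ** 2)           # 0.0
negative_loss = np.mean((shuffled_subject + predicate - obj) ** 2)  # 20.5
margin = 1.0
transe_loss = np.clip(margin + positive_loss - negative_loss, 0.0, 100.0)
print(positive_loss, negative_loss, transe_loss)  # 0.0 20.5 0.0
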
Example #17
  def compute_knowledge_selection_and_loss(self, features, encoder_output,
                                           fact_embedding, fact_lengths):
    """Compute knowledge selection and loss.

    Args:
      features: features.
      encoder_output: <tf.float32>[batch_size, input_length, hidden_dim]
      fact_embedding: <tf.float32>[batch_size*max_triple_num, max_triple_length,
        emb_dim]
      fact_lengths: <tf.int32>[batch_size*max_triple_num]

    Returns:
      knowledge_weights:
      knowledge_loss:
    """
    hparams = self._hparams
    encoder_output_shape = common_layers.shape_list(encoder_output)
    encoder_hidden_dim = encoder_output_shape[-1]
    inputs = features["inputs"]
    # <tf.float32>[batch_size, input_length, emb_dim]
    inputs = tf.squeeze(inputs, 2)
    # <tf.float32>[batch_size, input_length]
    context_padding = common_attention.embedding_to_padding(inputs)
    # <tf.float32>[batch_size]
    context_lens = tf.to_float(
        common_attention.padding_to_length(context_padding))
    # <tf.float32>[batch_size, 1]
    context_lens = tf.expand_dims(context_lens, -1)
    # Compute context vector summary.
    # <tf.float32>[batch_size, hidden_dim]
    context_vector_summary = compute_summary_embedding(encoder_output,
                                                       context_lens, hparams)
    knowledge_encoder_output = compute_average_embedding(
        fact_embedding, fact_lengths)
    # <tf.float32>[batch_size, triple_num, emb_dim]
    knowledge_encoder_output = tf.reshape(
        knowledge_encoder_output, [-1, self.triple_num, encoder_hidden_dim])
    original_knowledge_encoder_output = knowledge_encoder_output
    if hparams.similarity_fuction == "dot_product":
      triple_logits = tf.squeeze(
          tf.matmul(knowledge_encoder_output,
                    tf.expand_dims(context_vector_summary, 2)), -1)
    elif hparams.similarity_fuction == "bilinear":
      # Tile the context vector summary.
      # <tf.float32>[batch_size, max_triple_num*hidden_dim]
      tiled_context_vector = tf.tile(context_vector_summary,
                                     [1, self.triple_num])
      # <tf.float32>[batch_size, max_triple_num, hidden_dim]
      context_vector = tf.reshape(tiled_context_vector,
                                  [-1, self.triple_num, encoder_hidden_dim])
      # compute outer product
      context_vector = tf.expand_dims(context_vector, -1)
      knowledge_encoder_output = tf.expand_dims(knowledge_encoder_output, 2)
      # <tf.float32>[batch_size, max_triple_num, hidden_dim, hidden_dim]
      outer_product = tf.matmul(context_vector, knowledge_encoder_output)
      outer_product = tf.reshape(
          outer_product,
          [-1, self.triple_num, encoder_hidden_dim * encoder_hidden_dim])
      triple_logits = tf.squeeze(
          tf.layers.dense(outer_product, 1, name="knolwedge_final_mlp"), -1)

    avg_triple_loss = 0.0
    triple_labels = features["triple_labels"]
    triple_labels = triple_labels[:, :self.triple_num]
    if hparams.mode != tf.estimator.ModeKeys.PREDICT:
      triple_losses = tf.nn.weighted_cross_entropy_with_logits(
          labels=triple_labels,
          logits=triple_logits,
          pos_weight=hparams.pos_weight)
      avg_triple_loss = tf.reduce_mean(triple_losses)
      tf.summary.scalar("triple_loss", avg_triple_loss)

    return triple_logits, avg_triple_loss, original_knowledge_encoder_output
Example #18
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
    """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
    ishape_static = inputs.shape.as_list()
    encoder_input = inputs
    if features and "inputs_segmentation" in features:
        # Packed dataset.  Keep the examples from seeing each other.
        inputs_segmentation = features["inputs_segmentation"]
        inputs_position = features["inputs_position"]
        targets_segmentation = features["targets_segmentation"]
        encoder_self_attention_bias = common_attention.attention_bias_same_segment(
            inputs_segmentation, inputs_segmentation)
        encoder_decoder_attention_bias = (
            common_attention.attention_bias_same_segment(
                targets_segmentation, inputs_segmentation))
    else:
        # Usual case - not a packed dataset.
        encoder_padding = common_attention.embedding_to_padding(encoder_input)
        ignore_padding = common_attention.attention_bias_ignore_padding(
            encoder_padding)
        encoder_self_attention_bias = ignore_padding
        encoder_decoder_attention_bias = ignore_padding
        inputs_position = None
    if hparams.proximity_bias:
        encoder_self_attention_bias += common_attention.attention_bias_proximal(
            common_layers.shape_list(inputs)[1])
    # Append target_space_id embedding to inputs.
    emb_target_space = common_layers.embedding(target_space,
                                               32,
                                               ishape_static[-1],
                                               name="target_space_embedding")
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
    encoder_input += emb_target_space
    #if hparams.pos == "timing":
    #  if inputs_position is not None:
    #    encoder_input = common_attention.add_timing_signal_1d_given_position(
    #        encoder_input, inputs_position)
    #  else:
    #    encoder_input = common_attention.add_timing_signal_1d(encoder_input)
    raw_encoder_input = tf.squeeze(features['inputs_raw'], axis=[-2, -1])
    pos_signals = generate_positional_signals(raw_encoder_input, hparams)
    pos_embeddings = generate_positional_embeddings(pos_signals,
                                                    hparams.encoder_pos,
                                                    hparams)
    if "sum" in hparams.encoder_pos_integration:
        encoder_input = encoder_input + pos_embeddings
    elif "ffn" in hparams.encoder_pos_integration:
        with tf.variable_scope("encoder_pos_ffn"):
            encoder_input = tf.concat([encoder_input, pos_embeddings], axis=2)
            encoder_input = transformer_ffn_layer(encoder_input,
                                                  hparams,
                                                  conv_padding="SAME")
    return (encoder_input, encoder_self_attention_bias,
            encoder_decoder_attention_bias)
Example #19
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
    """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
      sg: inputs here have been flattened to 3d
        [batch, height, width, embed_size] ->
        [batch, height*width, embed_size]
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
    ishape_static = inputs.shape.as_list()
    encoder_input = inputs
    if features and "inputs_segmentation" in features:
        # Packed dataset.  Keep the examples from seeing each other.
        inputs_segmentation = features["inputs_segmentation"]
        inputs_position = features["inputs_position"]
        targets_segmentation = features["targets_segmentation"]
        encoder_self_attention_bias = common_attention.attention_bias_same_segment(
            inputs_segmentation, inputs_segmentation)
        encoder_decoder_attention_bias = (
            common_attention.attention_bias_same_segment(
                targets_segmentation, inputs_segmentation))
    else:
        # Usual case - not a packed dataset.
        encoder_padding = common_attention.embedding_to_padding(encoder_input)
        # sg: [batch_size, sentence_len]
        ignore_padding = common_attention.attention_bias_ignore_padding(
            encoder_padding)
        # sg: [batch_size, 1, 1, sentence_len]
        # a bias tensor to be added to attention logits:
        # for padded words the bias equals -1e9,
        # for non-padded words it equals 0
        encoder_self_attention_bias = ignore_padding
        encoder_decoder_attention_bias = ignore_padding
        inputs_position = None
    if hparams.proximity_bias:
        encoder_self_attention_bias += common_attention.attention_bias_proximal(
            common_layers.shape_list(inputs)[1])
    # Append target_space_id embedding to inputs.
    emb_target_space = common_layers.embedding(
        target_space,
        32,
        # sg: 32 is the vocab size (per the comments in the function; may not
        # be exact); this is because at present t2t only has SpaceIDs in
        # problem.py ranging from 1 to 32
        ishape_static[-1],
        # sg: embedding dimension
        name="target_space_embedding",
        dtype=tf.bfloat16
        if hparams.activation_dtype == "bfloat16" else tf.float32)
    # sg: [1,128] a dense vector to represent SpaceID
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
    # sg: [1,1,128]
    encoder_input += emb_target_space
    if hparams.pos == "timing":
        if inputs_position is not None:
            encoder_input = common_attention.add_timing_signal_1d_given_position(
                encoder_input, inputs_position)
        else:
            encoder_input = common_attention.add_timing_signal_1d(
                encoder_input)
    if hparams.activation_dtype == "bfloat16":
        encoder_self_attention_bias = tf.cast(encoder_self_attention_bias,
                                              tf.bfloat16)
        encoder_decoder_attention_bias = tf.cast(
            encoder_decoder_attention_bias, tf.bfloat16)
    return (encoder_input, encoder_self_attention_bias,
            encoder_decoder_attention_bias)