Example #1
  def call(self, input_tensor, labels=None, training=None):
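    """Computes classification log-probabilities and, optionally, the loss.

    Lazily creates the kernel (and bias) variables on first call, projects
    the dropped-out input to `num_labels` logits, and, when `labels` is
    provided, returns the mean softmax cross-entropy over the batch;
    otherwise the returned loss is 0.0.
    """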
    last_dim = utils.get_shape_list(input_tensor)[-1]
    input_tensor = utils.dropout(input_tensor, self.dropout_prob, training)

    if self.w is None:
      self.w = tf.compat.v1.get_variable(
          name="kernel",
          shape=[last_dim, self.num_labels],
          initializer=self.initializer)
      self.initializer = None
      self._trainable_weights.append(self.w)
    logits = tf.matmul(input_tensor, self.w)

    if self.use_bias:
      if self.b is None:
        self.b = tf.compat.v1.get_variable(
            name="bias",
            shape=[self.num_labels],
            initializer=tf.zeros_initializer)
        self._trainable_weights.append(self.b)
      logits = tf.nn.bias_add(logits, self.b)

    log_probs = tf.nn.log_softmax(logits, axis=-1)
    if labels is not None:
      one_hot_labels = tf.one_hot(labels, depth=self.num_labels,
                                  dtype=tf.float32)
      per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
      loss = tf.reduce_mean(per_example_loss)
    else:
      loss = tf.constant(0.0)

    return loss, log_probs
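
The loss above is a hand-rolled sparse softmax cross-entropy. As a sanity check, here is a minimal, self-contained sketch (TF2 eager mode, made-up logits and labels, not part of the example) showing that the one-hot/log-softmax formulation agrees with tf.nn.sparse_softmax_cross_entropy_with_logits:

import tensorflow as tf

logits = tf.constant([[2.0, 0.5, -1.0],
                      [0.1, 0.2, 0.3]])
labels = tf.constant([0, 2])

# Same computation as in the example above.
log_probs = tf.nn.log_softmax(logits, axis=-1)
one_hot_labels = tf.one_hot(labels, depth=3, dtype=tf.float32)
manual_loss = tf.reduce_mean(
    -tf.reduce_sum(one_hot_labels * log_probs, axis=-1))

# Built-in reference.
reference_loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                   logits=logits))
print(float(manual_loss), float(reference_loss))  # the two values match
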
Example #2
def original_full_attention(query_layer, key_layer, value_layer,
                            attention_mask, size_per_head,
                            attention_probs_dropout_prob):
    """Full quadratic attention calculation.

  Args:
    query_layer: float Tensor of shape [batch_size, num_attention_heads,
      from_seq_length, size_per_head]
    key_layer: float Tensor of shape [batch_size, num_attention_heads,
      to_seq_length, size_per_head]
    value_layer: float Tensor of shape [batch_size, num_attention_heads,
      to_seq_length, size_per_head]
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length, to_seq_length]. The values should be 1 or 0. The
      attention scores will effectively be set to -infinity for any positions in
      the mask that are 0, and will be unchanged for positions that are 1.
    size_per_head: (optional) int. Size of each attention head.
    attention_probs_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.

  Returns:
    float Tensor of shape [batch_size, from_seq_length, num_attention_heads,
      size_per_head].
  """

  # Directly take n^2 dot product between "query" and "key".
  attention_scores = tf.einsum("BNFH,BNTH->BNFT", query_layer, key_layer)
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / np.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
    # masked positions, this operation will create a tensor which is 0.0 for
    # positions we want to attend and -10000.0 for masked positions.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  # `attention_probs` = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs = utils.dropout(attention_probs,
                                  attention_probs_dropout_prob)

  # `context_layer` = [B, F, N, H]
  context_layer = tf.einsum("BNFT,BNTH->BFNH", attention_probs, value_layer)
  return context_layer
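
The shapes in the einsum strings are easier to follow with concrete numbers. Below is a self-contained sketch of the same computation (made-up shapes, dropout omitted, and the mask explicitly expanded over the heads axis so it broadcasts; this is an illustration, not the library's code):

import tensorflow as tf

B, N, F, T, H = 2, 4, 8, 8, 16   # batch, heads, from_len, to_len, head size
q = tf.random.normal([B, N, F, H])
k = tf.random.normal([B, N, T, H])
v = tf.random.normal([B, N, T, H])
# 1/0 mask of shape [B, F, T]; the last two "to" positions are masked out.
mask = tf.concat([tf.ones([B, F, T - 2]), tf.zeros([B, F, 2])], axis=-1)

scores = tf.einsum("BNFH,BNTH->BNFT", q, k) / (float(H) ** 0.5)
scores += (1.0 - mask)[:, tf.newaxis, :, :] * -10000.0  # broadcast over heads
probs = tf.nn.softmax(scores)                           # [B, N, F, T]
context = tf.einsum("BNFT,BNTH->BFNH", probs, v)
print(context.shape)  # (2, 8, 4, 16) -> [batch, from_len, heads, head size]
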
Example #3
  def call(self,
           layer_input,
           encoder_outputs,
           self_attention_mask,
           attention_mask,
           cache=None,
           decode_i=None,
           training=None):
    """Implements a decoder layer of a transformer in BERT style.

    The layer_norm is taken after self-attention.

    Args:
      layer_input: float Tensor of shape [batch_size, seq_length, hidden_size].
      encoder_outputs: tensors with shape [batch_size, input_length,
          num_hidden_layers, hidden_size]
      self_attention_mask: bias for decoder self-attention layer. [1, 1,
        target_length, target_length]
      attention_mask: bias for encoder-decoder attention layer. [batch_size, 1,
        1, input_length]
      cache: (Used during prediction) A dictionary with tensors containing
        results of previous attentions. The dictionary must have the items:
            {"k": tensor with shape
                  [batch_size, max_len, num_attention_heads, size_per_head],
             "v": tensor with shape
                  [batch_size, max_len, num_attention_heads, size_per_head]}
      decode_i: (Used during prediction) current location of decoding
      training: Boolean indicating whether the call is training or inference.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size].

    Raises:
      ValueError: Any of the arguments or tensor shapes are invalid.
      NotImplementedError: For unknown attention type.
    """
    with tf.compat.v1.variable_scope("attention"):
      with tf.compat.v1.variable_scope("self") as sc:
        self_attention_output = self.self_attn_layer(
            layer_input, layer_input, self_attention_mask,
            cache=cache, decode_i=decode_i, training=training, scope=sc)

      # Run a linear projection of `hidden_size` then add a residual
      # with `layer_input`.
      with tf.compat.v1.variable_scope("output"):
        self_attention_output = self.self_proj_layer(self_attention_output)
        self_attention_output = utils.dropout(self_attention_output,
                                              self.hidden_dropout_prob,
                                              training)
        self_attention_output = self.first_layer_norm(
            self_attention_output + layer_input)

      with tf.compat.v1.variable_scope("encdec") as sc:
        attention_output = self.cross_attn_layer(
            self_attention_output, encoder_outputs, attention_mask,
            training=training, scope=sc)

      # Run a linear projection of `hidden_size` then add a residual
      # with `self_attention_output`.
      with tf.compat.v1.variable_scope("encdec_output"):
        attention_output = self.cross_proj_layer(attention_output)
        attention_output = utils.dropout(attention_output,
                                         self.hidden_dropout_prob,
                                         training)
        attention_output = self.second_layer_norm(
            attention_output + self_attention_output)

    # The activation is only applied to the "intermediate" hidden layer.
    with tf.compat.v1.variable_scope("intermediate"):
      intermediate_output = self.expand_layer(attention_output)

    # Down-project back to `hidden_size` then add the residual.
    with tf.compat.v1.variable_scope("output"):
      layer_output = self.contract_layer(intermediate_output)
      layer_output = utils.dropout(layer_output,
                                   self.hidden_dropout_prob,
                                   training)
      layer_output = self.third_layer_norm(layer_output + attention_output)
    return layer_output
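
For orientation, the sublayer ordering here is self-attention, cross-attention over the encoder outputs, then the feed-forward expand/contract pair, each followed by a residual add and a layer norm applied after the add (post-layer-norm). The sketch below shows just that wiring, with stock Keras layers standing in for the snippet's own attention and projection layers (masks and dropout omitted, made-up sizes); it is not the library's implementation:

import tensorflow as tf

hidden_size, num_heads, ffn_size = 64, 4, 256
head_size = hidden_size // num_heads
self_attn = tf.keras.layers.MultiHeadAttention(num_heads, head_size)
cross_attn = tf.keras.layers.MultiHeadAttention(num_heads, head_size)
expand = tf.keras.layers.Dense(ffn_size, activation="gelu")
contract = tf.keras.layers.Dense(hidden_size)
norm1, norm2, norm3 = (tf.keras.layers.LayerNormalization() for _ in range(3))

def decoder_block(x, encoder_outputs):
  # Self-attention, then residual add, then layer norm (norm after the add).
  h = norm1(x + self_attn(x, x))
  # Cross-attention over the encoder outputs, then residual add + layer norm.
  h = norm2(h + cross_attn(h, encoder_outputs))
  # Feed-forward expand -> contract, then residual add + layer norm.
  return norm3(h + contract(expand(h)))

x = tf.random.normal([2, 10, hidden_size])    # decoder-side input
enc = tf.random.normal([2, 12, hidden_size])  # encoder outputs
print(decoder_block(x, enc).shape)            # (2, 10, 64)
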
Example #4
  def operation(self,
                layer_input,
                attention_mask=None,
                band_mask=None,
                from_mask=None,
                to_mask=None,
                input_blocked_mask=None,
                training=None):
    """Implements a encoder layer of a transformer in Pegasus style.

    Args:
      layer_input: float Tensor of shape [batch_size, seq_length, hidden_size].
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        seq_length, seq_length]. The values should be 1 or 0. The
        attention scores will effectively be set to -infinity for any positions
        in the mask that are 0, and will be unchanged for positions that are 1.
      band_mask: (optional) int32 Tensor of shape [batch_size, 1,
        seq_length//block_size-4, block_size, 3*block_size].
        The values should be 1 or 0. The attention scores will effectively be
        set to -infinity for any positions in the mask that are 0, and will be
        unchanged for positions that are 1.
      from_mask: (optional) int32 Tensor of shape [batch_size, 1,
        seq_length, 1]. The values should be 1 or 0. The
        attention scores will effectively be set to -infinity for any positions
        in the mask that are 0, and will be unchanged for positions that are 1.
      to_mask: (optional) int32 Tensor of shape [batch_size, 1, 1,
        seq_length]. The values should be 1 or 0. The
        attention scores will effectively be set to -infinity for any positions
        in the mask that are 0, and will be unchanged for positions that are 1.
      input_blocked_mask: (optional) int32 Tensor of shape [batch_size,
        seq_length//block_size, block_size]. Same as from/to_mask, just
        reshaped.
      training: Boolean indicating whether the call is training or inference.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size].

    Raises:
      ValueError: Any of the arguments or tensor shapes are invalid.
      NotImplementedError: For unknown attention type.
    """

    with tf.compat.v1.variable_scope("attention"):
      with tf.compat.v1.variable_scope("self") as sc:
        normalized_layer_input = self.first_layer_norm.operation(layer_input)
        attention_output = self.attn_layer.operation(
            normalized_layer_input, normalized_layer_input,
            attention_mask, band_mask, from_mask, to_mask,
            input_blocked_mask, input_blocked_mask, training, scope=sc)

      # Run a linear projection of `hidden_size` then add a residual
      # with `layer_input`.
      with tf.compat.v1.variable_scope("output"):
        attention_output = self.projection_layer.operation(attention_output)
        attention_output = utils.dropout(attention_output,
                                         self.hidden_dropout_prob,
                                         training)
        attention_output = attention_output + layer_input

    # The activation is only applied to the "intermediate" hidden layer.
    with tf.compat.v1.variable_scope("intermediate"):
      normalized_attention_output = self.second_layer_norm.operation(
          attention_output)
      intermediate_output = self.expand_layer.operation(
          normalized_attention_output)

    # Down-project back to `hidden_size` then add the residual.
    with tf.compat.v1.variable_scope("output"):
      layer_output = self.contract_layer.operation(intermediate_output)
      layer_output = utils.dropout(layer_output,
                                   self.hidden_dropout_prob,
                                   training)
      layer_output = layer_output + attention_output
    return layer_output
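
Unlike the decoder layer in Example #3, this Pegasus-style layer is pre-layer-norm: each sublayer normalizes its input first, and the residual is a plain add with no norm after it. A minimal sketch of that ordering (stock Keras layers as stand-ins, masks and dropout omitted, made-up sizes; not the library's class):

import tensorflow as tf

hidden_size, num_heads, ffn_size = 64, 4, 256
attn = tf.keras.layers.MultiHeadAttention(num_heads, hidden_size // num_heads)
expand = tf.keras.layers.Dense(ffn_size, activation="gelu")
contract = tf.keras.layers.Dense(hidden_size)
norm1, norm2 = (tf.keras.layers.LayerNormalization() for _ in range(2))

def encoder_block(x):
  # Pre-norm self-attention: normalize the input, attend, plain residual add.
  normed = norm1(x)
  h = x + attn(normed, normed)
  # Pre-norm feed-forward: normalize, expand -> contract, plain residual add.
  return h + contract(expand(norm2(h)))

x = tf.random.normal([2, 16, hidden_size])
print(encoder_block(x).shape)  # (2, 16, 64)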