Example #1
    def test_high_dim_attention(self, q_dims, v_dims, mask_dims,
                                attention_axes):
        """Test with a mask tensor."""
        test_layer = attention.MultiHeadAttention(
            num_heads=2, key_size=2, attention_axes=attention_axes)
        batch_size, hidden_size = 3, 8
        # Generate data for the input (non-mask) tensors.
        query_shape = [batch_size] + q_dims + [hidden_size]
        value_shape = [batch_size] + v_dims + [hidden_size]
        mask_shape = [batch_size] + mask_dims
        query = 10 * np.random.random_sample(query_shape)
        value = 10 * np.random.random_sample(value_shape)

        # Invoke the layer with a random set of mask data. This should mask at
        # least one element.
        mask_data = np.random.randint(2, size=mask_shape).astype("bool")
        output = test_layer(query=query, value=value, attention_mask=mask_data)

        # Invoke the layer with the same data but a null mask (no elements masked).
        null_mask_data = np.ones(mask_shape)
        unmasked_output = test_layer(query=query,
                                     value=value,
                                     attention_mask=null_mask_data)
        # Because one output is masked and the other is not, they should not be
        # the same.
        self.assertNotAllClose(output, unmasked_output)
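This test is driven by a parameterized decorator supplying q_dims, v_dims, mask_dims, and attention_axes. A minimal sketch of such a decorator follows; the class name and parameter values are illustrative, not taken from the original test file:

from absl.testing import parameterized

class AttentionTest(parameterized.TestCase):
    # Hypothetical parameter sets: (q_dims, v_dims, mask_dims, attention_axes).
    @parameterized.named_parameters(
        ("4d_inputs_one_free_batch", [3, 4], [3, 2], [4, 2], (2,)),
        ("4d_inputs_2d_attention", [3, 4], [3, 2], [3, 4, 3, 2], (1, 2)),
    )
    def test_high_dim_attention(self, q_dims, v_dims, mask_dims, attention_axes):
        ...  # body as in Example #1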
Example #2
 def test_non_masked_self_attention(self):
     """Test with one input (self-attenntion) and no mask tensor."""
     test_layer = attention.MultiHeadAttention(num_heads=12, key_size=64)
     # Create a 3-dimensional input (the first dimension is implicit).
     query = tf.keras.Input(shape=(40, 80))
     output = test_layer([query, query])
     self.assertEqual(output.shape.as_list(), [None, 40, 80])
Example #3
    def test_masked_attention(self):
        """Test with a mask tensor."""
        test_layer = attention.MultiHeadAttention(num_heads=2, head_size=2)
        # Create a 3-dimensional input (the first dimension is implicit).
        from_tensor = tf.keras.Input(shape=(4, 8))
        to_tensor = tf.keras.Input(shape=(2, 8))
        mask_tensor = tf.keras.Input(shape=(4, 2))
        output = test_layer([from_tensor, to_tensor, mask_tensor])

        # Create a model containing the test layer.
        model = tf.keras.Model([from_tensor, to_tensor, mask_tensor], output)

        # Generate data for the input (non-mask) tensors.
        from_data = 10 * np.random.random_sample((3, 4, 8))
        to_data = 10 * np.random.random_sample((3, 2, 8))

        # Invoke the model with a random set of mask data. This should mask at
        # least one element.
        mask_data = np.random.randint(2, size=(3, 4, 2))
        masked_output_data = model.predict([from_data, to_data, mask_data])

        # Invoke the model with the same data but a null mask (no elements masked).
        null_mask_data = np.ones((3, 4, 2))
        unmasked_output_data = model.predict(
            [from_data, to_data, null_mask_data])

        # Because one output is masked and the other is not, they should not be
        # the same.
        self.assertNotAllClose(masked_output_data, unmasked_output_data)
Example #4
 def test_non_masked_attention(self):
     """Test that the attention layer can be created without a mask tensor."""
     test_layer = attention.MultiHeadAttention(num_heads=12, head_size=64)
     # Create a 3-dimensional input (the first dimension is implicit).
     from_tensor = tf.keras.Input(shape=(40, 80))
     to_tensor = tf.keras.Input(shape=(20, 80))
     output = test_layer([from_tensor, to_tensor])
     self.assertEqual(output.shape.as_list(), [None, 40, 12, 64])
Example #5
    def test_masked_attention(self, use_bias):
        """Test with a mask tensor."""
        test_layer = attention.MultiHeadAttention(num_heads=2,
                                                  key_size=2,
                                                  use_bias=use_bias)
        # Create a 3-dimensional input (the first dimension is implicit).
        batch_size = 3
        query = tf.keras.Input(shape=(4, 8))
        value = tf.keras.Input(shape=(2, 8))
        mask_tensor = tf.keras.Input(shape=(4, 2))
        output = test_layer(query=query,
                            value=value,
                            attention_mask=mask_tensor)

        # Create a model containing the test layer.
        model = tf.keras.Model([query, value, mask_tensor], output)

        # Generate data for the input (non-mask) tensors.
        from_data = 10 * np.random.random_sample((batch_size, 4, 8))
        to_data = 10 * np.random.random_sample((batch_size, 2, 8))

        # Invoke the model with a random set of mask data. This should mask at
        # least one element.
        mask_data = np.random.randint(2, size=(batch_size, 4, 2))
        masked_output_data = model.predict([from_data, to_data, mask_data])

        # Invoke the model with the same data but a null mask (no elements masked).
        null_mask_data = np.ones((batch_size, 4, 2))
        unmasked_output_data = model.predict(
            [from_data, to_data, null_mask_data])

        # Because one output is masked and the other is not, they should not be
        # the same.
        self.assertNotAllClose(masked_output_data, unmasked_output_data)

        # Tests the layer with three inputs: Q, K, V.
        key = tf.keras.Input(shape=(2, 8))
        output = test_layer(query,
                            value=value,
                            key=key,
                            attention_mask=mask_tensor)
        model = tf.keras.Model([query, value, key, mask_tensor], output)

        masked_output_data = model.predict(
            [from_data, to_data, to_data, mask_data])
        unmasked_output_data = model.predict(
            [from_data, to_data, to_data, null_mask_data])
        # Because one output is masked and the other is not, they should not be
        # the same.
        self.assertNotAllClose(masked_output_data, unmasked_output_data)

        if use_bias:
            self.assertLen(test_layer._query_dense.trainable_variables, 2)
            self.assertLen(test_layer._output_dense.trainable_variables, 2)
        else:
            self.assertLen(test_layer._query_dense.trainable_variables, 1)
            self.assertLen(test_layer._output_dense.trainable_variables, 1)
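The use_bias flag is supplied by a parameterized decorator. A minimal sketch (the class and case names are illustrative, not the original file's):

from absl.testing import parameterized

class MaskedAttentionTest(parameterized.TestCase):
    @parameterized.named_parameters(("with_bias", True), ("no_bias", False))
    def test_masked_attention(self, use_bias):
        ...  # body as in Example #5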
Example #6
 def test_attention_scores(self):
     """Test attention outputs with coefficients."""
     test_layer = attention.MultiHeadAttention(num_heads=12,
                                               key_size=64,
                                               return_attention_scores=True)
     # Create a 3-dimensional input (the first dimension is implicit).
     query = tf.keras.Input(shape=(40, 80))
     output, coef = test_layer([query, query])
     self.assertEqual(output.shape.as_list(), [None, 40, 80])
     self.assertEqual(coef.shape.as_list(), [None, 12, 40, 40])
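As a rough eager-mode sketch of the same call (assuming the same `attention` module import used by the test above, and not verified against that exact code version), the returned coefficients are shaped [batch, num_heads, query_length, key_length]:

import numpy as np

layer = attention.MultiHeadAttention(
    num_heads=12, key_size=64, return_attention_scores=True)
query = np.random.random_sample((3, 40, 80)).astype("float32")
output, scores = layer([query, query])
# output.shape == (3, 40, 80); scores.shape == (3, 12, 40, 40)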
Example #7
 def test_initializer(self):
   """Test with a specified initializer."""
   test_layer = attention.MultiHeadAttention(
       num_heads=12,
       key_size=64,
       kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
   # Create a 3-dimensional input (the first dimension is implicit).
   query = tf.keras.Input(shape=(40, 80))
   output = test_layer([query, query])
   self.assertEqual(output.shape.as_list(), [None, 40, 80])
Example #8
 def test_non_masked_attention(self, value_size, output_shape, output_dims):
     """Test that the attention layer can be created without a mask tensor."""
     test_layer = attention.MultiHeadAttention(num_heads=12,
                                               key_size=64,
                                               value_size=value_size,
                                               output_shape=output_shape)
     # Create a 3-dimensional input (the first dimension is implicit).
     query = tf.keras.Input(shape=(40, 80))
     value = tf.keras.Input(shape=(20, 80))
     output = test_layer([query, value])
     self.assertEqual(output.shape.as_list(), [None] + output_dims)
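The three parameters travel together. Plausible combinations (hypothetical values, chosen only to show how output_dims follows from output_shape; the class name is illustrative):

from absl.testing import parameterized

class NonMaskedAttentionTest(parameterized.TestCase):
    # (value_size, output_shape, expected dims after the batch axis)
    @parameterized.named_parameters(
        ("default_output", None, None, [40, 80]),  # falls back to the query width (80)
        ("custom_width", 32, 60, [40, 60]),        # output_shape overrides the last dim
    )
    def test_non_masked_attention(self, value_size, output_shape, output_dims):
        ...  # body as in Example #8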
Example #9
  def build(self, input_shape):
    input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape
    input_tensor_shape = tf.TensorShape(input_tensor)
    if len(input_tensor_shape) != 3:
      raise ValueError("TransformerLayer expects a three-dimensional input of "
                       "shape [batch, sequence, width].")
    batch_size, sequence_length, hidden_size = input_tensor_shape

    if len(input_shape) == 2:
      mask_tensor_shape = tf.TensorShape(input_shape[1])
      expected_mask_tensor_shape = tf.TensorShape(
          [batch_size, sequence_length, sequence_length])
      if not expected_mask_tensor_shape.is_compatible_with(mask_tensor_shape):
        raise ValueError("When passing a mask tensor to TransformerLayer, the "
                         "mask tensor must be of shape [batch, "
                         "sequence_length, sequence_length] (here %s). Got a "
                         "mask tensor of shape %s." %
                         (expected_mask_tensor_shape, mask_tensor_shape))
    if hidden_size % self._num_heads != 0:
      raise ValueError(
          "The input size (%d) is not a multiple of the number of attention "
          "heads (%d)" % (hidden_size, self._num_heads))
    self._attention_head_size = int(hidden_size // self._num_heads)
    common_kwargs = dict(
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint)
    self._attention_layer = attention.MultiHeadAttention(
        num_heads=self._num_heads,
        key_size=self._attention_head_size,
        dropout=self._attention_dropout_rate,
        name="self_attention",
        **common_kwargs)
    # pylint: disable=protected-access
    # Temporarily handling for checkpoint compatible changes.
    self._attention_layer._build_from_signature(
        query=input_tensor_shape, value=input_tensor_shape)
    self._attention_output_dense = self._attention_layer._output_dense
Example #10
    def build(self, input_shape):
        input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape
        input_tensor_shape = tf.TensorShape(input_tensor)
        if len(input_tensor_shape) != 3:
            raise ValueError(
                "TransformerLayer expects a three-dimensional input of "
                "shape [batch, sequence, width].")
        batch_size, sequence_length, hidden_size = input_tensor_shape

        if len(input_shape) == 2:
            mask_tensor_shape = tf.TensorShape(input_shape[1])
            expected_mask_tensor_shape = tf.TensorShape(
                [batch_size, sequence_length, sequence_length])
            if not expected_mask_tensor_shape.is_compatible_with(
                    mask_tensor_shape):
                raise ValueError(
                    "When passing a mask tensor to TransformerLayer, the "
                    "mask tensor must be of shape [batch, "
                    "sequence_length, sequence_length] (here %s). Got a "
                    "mask tensor of shape %s." %
                    (expected_mask_tensor_shape, mask_tensor_shape))
        if hidden_size % self._num_heads != 0:
            raise ValueError(
                "The input size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (hidden_size, self._num_heads))
        self._attention_head_size = int(hidden_size // self._num_heads)

        self._attention_layer = attention.MultiHeadAttention(
            num_heads=self._num_heads,
            key_size=self._attention_head_size,
            dropout=self._attention_dropout_rate,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            kernel_regularizer=self._kernel_regularizer,
            bias_regularizer=self._bias_regularizer,
            activity_regularizer=self._activity_regularizer,
            kernel_constraint=self._kernel_constraint,
            bias_constraint=self._bias_constraint,
            name="self_attention")
        self._attention_dropout = tf.keras.layers.Dropout(
            rate=self._dropout_rate)
        if self._use_layer_norm:
            # Use float32 in layernorm for numeric stability.
            # It is probably safe in mixed_float16, but we haven't validated this yet.
            self._attention_layer_norm = (tf.keras.layers.LayerNormalization(
                name="self_attention_layer_norm",
                axis=-1,
                epsilon=1e-12,
                dtype=tf.float32))
        self._intermediate_dense = dense_einsum.DenseEinsum(
            output_shape=self._intermediate_size,
            activation=None,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            kernel_regularizer=self._kernel_regularizer,
            bias_regularizer=self._bias_regularizer,
            activity_regularizer=self._activity_regularizer,
            kernel_constraint=self._kernel_constraint,
            bias_constraint=self._bias_constraint,
            name="intermediate")
        self._intermediate_activation_layer = tf.keras.layers.Activation(
            self._intermediate_activation)
        self._output_dense = dense_einsum.DenseEinsum(
            output_shape=hidden_size,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            kernel_regularizer=self._kernel_regularizer,
            bias_regularizer=self._bias_regularizer,
            activity_regularizer=self._activity_regularizer,
            kernel_constraint=self._kernel_constraint,
            bias_constraint=self._bias_constraint,
            name="output")
        self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
        if self._use_layer_norm:
            # Use float32 in layernorm for numeric stability.
            self._output_layer_norm = tf.keras.layers.LayerNormalization(
                name="output_layer_norm",
                axis=-1,
                epsilon=1e-12,
                dtype=tf.float32)

        self._rezero_a = self.add_weight(
            name="rezero_alpha",
            initializer=tf.keras.initializers.Zeros(),
            trainable=True,
            dtype=tf.float32)

        super(ReZeroTransformer, self).build(input_shape)
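The call() method is not shown above, so the following is only an assumed illustration of the ReZero pattern that rezero_alpha enables: the residual branch is scaled by a trainable scalar initialized to zero, so each block starts out as the identity.

import tensorflow as tf

alpha = tf.Variable(0.0, trainable=True, name="rezero_alpha")
x = tf.random.normal([2, 16, 128])             # residual branch input
sublayer_out = tf.random.normal([2, 16, 128])  # stand-in for the attention/FFN output
y = x + alpha * sublayer_out                   # equals x at initialization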
Example #11
  def build(self, input_shape):
    input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape
    input_tensor_shape = tf.TensorShape(input_tensor)
    if len(input_tensor_shape) != 3:
      raise ValueError("TransformerLayer expects a three-dimensional input of "
                       "shape [batch, sequence, width].")
    batch_size, sequence_length, hidden_size = input_tensor_shape

    if len(input_shape) == 2:
      mask_tensor_shape = tf.TensorShape(input_shape[1])
      expected_mask_tensor_shape = tf.TensorShape(
          [batch_size, sequence_length, sequence_length])
      if not expected_mask_tensor_shape.is_compatible_with(mask_tensor_shape):
        raise ValueError("When passing a mask tensor to TransformerLayer, the "
                         "mask tensor must be of shape [batch, "
                         "sequence_length, sequence_length] (here %s). Got a "
                         "mask tensor of shape %s." %
                         (expected_mask_tensor_shape, mask_tensor_shape))
    if hidden_size % self._num_heads != 0:
      raise ValueError(
          "The input size (%d) is not a multiple of the number of attention "
          "heads (%d)" % (hidden_size, self._num_heads))
    self._attention_head_size = int(hidden_size // self._num_heads)
    common_kwargs = dict(
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint)
    self._attention_layer = attention.MultiHeadAttention(
        num_heads=self._num_heads,
        key_size=self._attention_head_size,
        dropout=self._attention_dropout_rate,
        use_bias=self._use_bias,
        kernel_initializer=self._attention_initializer,
        name="self_attention",
        **common_kwargs)
    self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
    # Use float32 in layernorm for numeric stability.
    # It is probably safe in mixed_float16, but we haven't validated this yet.
    self._attention_layer_norm = (
        tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm",
            axis=-1,
            epsilon=self._norm_epsilon,
            dtype=tf.float32))
    self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
        "abc,cd->abd",
        output_shape=(None, self._intermediate_size),
        bias_axes="d",
        kernel_initializer=self._kernel_initializer,
        name="intermediate",
        **common_kwargs)
    policy = tf.keras.mixed_precision.experimental.global_policy()
    if policy.name == "mixed_bfloat16":
      # bfloat16 causes BERT with the LAMB optimizer to not converge
      # as well, so we use float32.
      # TODO(b/154538392): Investigate this.
      policy = tf.float32
    self._intermediate_activation_layer = tf.keras.layers.Activation(
        self._intermediate_activation, dtype=policy)
    self._intermediate_dropout_layer = tf.keras.layers.Dropout(
        rate=self._intermediate_dropout)
    self._output_dense = tf.keras.layers.experimental.EinsumDense(
        "abc,cd->abd",
        output_shape=(None, hidden_size),
        bias_axes="d",
        name="output",
        kernel_initializer=self._kernel_initializer,
        **common_kwargs)
    self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
    # Use float32 in layernorm for numeric stability.
    self._output_layer_norm = tf.keras.layers.LayerNormalization(
        name="output_layer_norm",
        axis=-1,
        epsilon=self._norm_epsilon,
        dtype=tf.float32)

    super(Transformer, self).build(input_shape)
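For reference, a standalone sketch of the einsum projection used for the intermediate and output layers above: the equation "abc,cd->abd" maps [batch, sequence, hidden] to [batch, sequence, units]. The sizes below are illustrative only.

import tensorflow as tf

dense = tf.keras.layers.experimental.EinsumDense(
    "abc,cd->abd", output_shape=(None, 3072), bias_axes="d")
x = tf.zeros([2, 16, 768])  # [batch, sequence, hidden]
y = dense(x)                # shape (2, 16, 3072): [batch, sequence, intermediate]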
Example #12
    def build(self, input_shape):
        # Input + mask.
        input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape
        input_tensor_shape = tf.TensorShape(input_tensor)
        if len(input_tensor_shape) != 3:
            raise ValueError(
                "TransformerLayer expects a three-dimensional input of "
                "shape [batch, sequence, width].")
        batch_size, sequence_length, hidden_size = input_tensor_shape

        if len(input_shape) == 2:
            mask_tensor_shape = tf.TensorShape(input_shape[1])
            expected_mask_tensor_shape = tf.TensorShape(
                [batch_size, sequence_length, sequence_length])
            if not expected_mask_tensor_shape.is_compatible_with(
                    mask_tensor_shape):
                raise ValueError(
                    "When passing a mask tensor to TransformerLayer, the "
                    "mask tensor must be of shape [batch, "
                    "sequence_length, sequence_length] (here %s). Got a "
                    "mask tensor of shape %s." %
                    (expected_mask_tensor_shape, mask_tensor_shape))
        # Multi-head attention.
        if hidden_size % self._num_heads != 0:
            raise ValueError(
                "The input size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (hidden_size, self._num_heads))
        self._attention_head_size = int(hidden_size // self._num_heads)

        # Attention layer.
        self._attention_layer = attention.MultiHeadAttention(
            num_heads=self._num_heads,
            key_size=self._attention_head_size,
            dropout=self._attention_dropout_rate,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            kernel_regularizer=self._kernel_regularizer,
            bias_regularizer=self._bias_regularizer,
            activity_regularizer=self._activity_regularizer,
            kernel_constraint=self._kernel_constraint,
            bias_constraint=self._bias_constraint,
            name="self_attention")
        # pylint: disable=protected-access
        # Build the attention layer manually so the input sizes are fixed.
        self._attention_layer.build([input_tensor_shape] * 3)

        self._attention_output_dense = self._attention_layer._output_dense
        # pylint: enable=protected-access

        self._attention_dropout = tf.keras.layers.Dropout(
            rate=self._dropout_rate)
        # Use float32 in layernorm for numeric stability.
        # It is probably safe in mixed_float16, but we haven't validated this yet.

        self._attention_layer_norm = (tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm",
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32))

        # Intermediate layer.
        self._intermediate_dense = dense_einsum.DenseEinsum(
            output_shape=self._intermediate_size,
            activation=None,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            kernel_regularizer=self._kernel_regularizer,
            bias_regularizer=self._bias_regularizer,
            activity_regularizer=self._activity_regularizer,
            kernel_constraint=self._kernel_constraint,
            bias_constraint=self._bias_constraint,
            name="intermediate")
        policy = tf.keras.mixed_precision.experimental.global_policy()
        if policy.name == "mixed_bfloat16":
            # bfloat16 causes BERT with the LAMB optimizer to not converge
            # as well, so we use float32.
            # TODO(b/154538392): Investigate this.
            policy = tf.float32

        # Intermediate-layer activation function.
        self._intermediate_activation_layer = tf.keras.layers.Activation(
            self._intermediate_activation, dtype=policy)

        # Output dense (linear) layer.
        self._output_dense = dense_einsum.DenseEinsum(
            output_shape=hidden_size,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            kernel_regularizer=self._kernel_regularizer,
            bias_regularizer=self._bias_regularizer,
            activity_regularizer=self._activity_regularizer,
            kernel_constraint=self._kernel_constraint,
            bias_constraint=self._bias_constraint,
            name="output")
        self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
        # Use float32 in layernorm for numeric stability.
        self._output_layer_norm = tf.keras.layers.LayerNormalization(
            name="output_layer_norm", axis=-1, epsilon=1e-12, dtype=tf.float32)

        super(Transformer, self).build(input_shape)
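For reference, a minimal sketch of the shapes this build() accepts (illustrative values; the dimension names follow the error messages above):

import tensorflow as tf

data_shape = tf.TensorShape([2, 16, 128])  # [batch, sequence, width]
mask_shape = tf.TensorShape([2, 16, 16])   # [batch, sequence, sequence]
# build() accepts either data_shape alone or [data_shape, mask_shape]; with
# 8 attention heads, hidden size 128 gives a head size of 128 // 8 = 16.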