Example 1
    def __init__(self,
                 num_attention_heads,
                 inner_dim=768,
                 inner_activation=tf_utils.get_activation("gelu"),
                 dropout_rate=0.0,
                 attention_dropout_rate=0.0,
                 output_range=None,
                 kernel_initializer="glorot_uniform",
                 bias_initializer="zeros",
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 use_layer_norm=False,
                 share_rezero=True,
                 **kwargs):
        # attention_dropout will override attention_dropout_rate.
        # This is to unify the input params with TransformerEncoderBlock.
        attention_dropout_rate = kwargs.pop("attention_dropout",
                                            attention_dropout_rate)
        dropout_rate = kwargs.pop("output_dropout", dropout_rate)
        inner_dim = kwargs.pop("intermediate_size", inner_dim)
        inner_activation = kwargs.pop("intermediate_activation",
                                      inner_activation)
        util.filter_kwargs(kwargs)
        super().__init__(**kwargs)

        self._num_heads = num_attention_heads
        self._inner_dim = inner_dim
        self._inner_activation = inner_activation
        self._attention_dropout_rate = attention_dropout_rate
        self._dropout_rate = dropout_rate
        self._output_range = output_range
        self._kernel_initializer = tf.keras.initializers.get(
            kernel_initializer)
        self._bias_initializer = tf.keras.initializers.get(bias_initializer)
        self._kernel_regularizer = tf.keras.regularizers.get(
            kernel_regularizer)
        self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
        self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
        self._bias_constraint = tf.keras.constraints.get(bias_constraint)
        self._use_layer_norm = use_layer_norm
        self._share_rezero = share_rezero
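As the comment notes, this constructor pops the legacy `TransformerEncoderBlock`-style names (`attention_dropout`, `output_dropout`, `intermediate_size`, `intermediate_activation`) from `**kwargs` so they can be used interchangeably with the newer argument names. A minimal standalone sketch of that aliasing pattern, using a hypothetical `MyBlock` layer rather than the class above:

import tensorflow as tf


class MyBlock(tf.keras.layers.Layer):
    """Hypothetical layer, used only to illustrate the kwargs aliasing."""

    def __init__(self, inner_dim=768, dropout_rate=0.0, **kwargs):
        # Legacy names, if supplied, take precedence over the new ones.
        inner_dim = kwargs.pop("intermediate_size", inner_dim)
        dropout_rate = kwargs.pop("output_dropout", dropout_rate)
        super().__init__(**kwargs)
        self._inner_dim = inner_dim
        self._dropout_rate = dropout_rate


# Both spellings configure the same attribute.
assert MyBlock(inner_dim=1024)._inner_dim == 1024
assert MyBlock(intermediate_size=1024)._inner_dim == 1024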
Example 2
    def __init__(self,
                 inner_dim=768,
                 inner_activation=tf_utils.get_activation("gelu"),
                 dropout=0.0,
                 use_gate=True,
                 apply_output_layer_norm=True,
                 num_blocks=1,
                 dropout_position="before_residual",
                 kernel_initializer="glorot_uniform",
                 bias_initializer="zeros",
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 **kwargs):
        inner_dim = kwargs.pop("intermediate_size", inner_dim)
        inner_activation = kwargs.pop("intermediate_activation",
                                      inner_activation)
        util.filter_kwargs(kwargs)
        super().__init__(**kwargs)
        self._inner_dim = inner_dim
        self._inner_activation = inner_activation
        self._dropout = dropout
        self._use_gate = use_gate
        self._num_blocks = num_blocks
        self._apply_output_layer_norm = apply_output_layer_norm
        self._dropout_position = dropout_position
        if self._dropout_position not in ("before_residual", "after_residual"):
            raise ValueError(
                "The dropout_position should be either `before_residual` or"
                "`after_residual`, got: %s" % self._dropout_position)

        self._kernel_initializer = tf.keras.initializers.get(
            kernel_initializer)
        self._bias_initializer = tf.keras.initializers.get(bias_initializer)
        self._kernel_regularizer = tf.keras.regularizers.get(
            kernel_regularizer)
        self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
        self._activity_regularizer = tf.keras.regularizers.get(
            activity_regularizer)
        self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
        self._bias_constraint = tf.keras.constraints.get(bias_constraint)
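`dropout_position` decides whether dropout is applied to the feedforward branch before it is added back to the input, or to the sum after the residual add; any other value fails fast with the `ValueError` above. A hedged sketch of that control flow (the `apply_block` helper and the exact placement are assumptions for illustration, not the layer's actual implementation):

import tensorflow as tf


def apply_block(x, ffn, dropout, dropout_position, training=False):
    """Hypothetical sketch of how `dropout_position` changes the residual path."""
    if dropout_position not in ("before_residual", "after_residual"):
        raise ValueError(
            "The dropout_position should be either `before_residual` or "
            "`after_residual`, got: %s" % dropout_position)
    y = ffn(x)
    if dropout_position == "before_residual":
        return x + dropout(y, training=training)  # drop the branch, then add
    return dropout(x + y, training=training)      # add first, then drop the sum


x = tf.ones([2, 8, 16])
ffn = tf.keras.layers.Dense(16)
drop = tf.keras.layers.Dropout(0.1)
print(apply_block(x, ffn, drop, "before_residual", training=True).shape)  # (2, 8, 16)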
Example 3
    def __init__(self,
                 num_attention_heads,
                 inner_dim=768,
                 inner_activation=tf_utils.get_activation("gelu"),
                 attention_cls=attention.MultiHeadAttention,
                 attention_cfg=None,
                 feedforward_cls=None,
                 feedforward_cfg=None,
                 dropout_rate=0.0,
                 attention_dropout_rate=0.0,
                 norm_first=False,
                 kernel_initializer="glorot_uniform",
                 bias_initializer="zeros",
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 **kwargs):
        inner_dim = kwargs.pop("intermediate_size", inner_dim)
        inner_activation = kwargs.pop("inner_activation", inner_activation)
        util.filter_kwargs(kwargs)
        super().__init__(**kwargs)

        self._attention_cfg = attention_cfg
        self._attention_cls = attention_cls
        self._feedforward_cls = feedforward_cls
        self._feedforward_cfg = feedforward_cfg
        self._norm_first = norm_first
        self._num_heads = num_attention_heads
        self._inner_dim = inner_dim
        self._inner_activation = inner_activation
        self._attention_dropout_rate = attention_dropout_rate
        self._dropout_rate = dropout_rate
        self._kernel_initializer = tf.keras.initializers.get(
            kernel_initializer)
        self._bias_initializer = tf.keras.initializers.get(bias_initializer)
        self._kernel_regularizer = tf.keras.regularizers.get(
            kernel_regularizer)
        self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
        self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
        self._bias_constraint = tf.keras.constraints.get(bias_constraint)
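Here the attention and feedforward sublayers are injected as a class plus a config dict (`attention_cls`/`attention_cfg`, `feedforward_cls`/`feedforward_cfg`) instead of being hard-coded. A minimal sketch of that injection pattern, with a hypothetical `ScaffoldBlock` standing in for the layer above:

import tensorflow as tf


class ScaffoldBlock(tf.keras.layers.Layer):
    """Hypothetical, stripped-down sketch of class-plus-config injection."""

    def __init__(self, attention_cls, attention_cfg=None, **kwargs):
        super().__init__(**kwargs)
        self._attention_cls = attention_cls
        self._attention_cfg = attention_cfg or {}

    def build(self, input_shape):
        # The attention layer is built from the injected class and config,
        # so callers can swap in any compatible attention implementation.
        self._attention_layer = self._attention_cls(**self._attention_cfg)
        super().build(input_shape)

    def call(self, inputs):
        return self._attention_layer(inputs, inputs)  # self-attention


block = ScaffoldBlock(
    attention_cls=tf.keras.layers.MultiHeadAttention,
    attention_cfg={"num_heads": 8, "key_dim": 64},
)
print(block(tf.ones([2, 16, 512])).shape)  # (2, 16, 512)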
Example 4
    def __init__(self,
                 num_attention_heads,
                 inner_dim,
                 inner_activation,
                 output_range=None,
                 kernel_initializer="glorot_uniform",
                 bias_initializer="zeros",
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 use_bias=True,
                 norm_first=False,
                 norm_epsilon=1e-12,
                 output_dropout=0.0,
                 attention_dropout=0.0,
                 inner_dropout=0.0,
                 attention_initializer=None,
                 attention_axes=None,
                 use_query_residual=True,
                 key_dim=None,
                 value_dim=None,
                 output_last_dim=None,
                 diff_q_kv_att_layer_norm=False,
                 **kwargs):
        """Initializes `TransformerEncoderBlock`.

    Note: If `output_last_dim` is used and `use_query_residual` is `True`, the
    `output_last_dim`'s value must equal the first input's last dimension for
    the query residual connection to work. This is because the residual
    connection after the multi-head-attention requires their dimensions to
    match. If `use_query_residual` is `False`, the `output_last_dim` dictates
    the last dimension of the output of this module and the
    multi-head-attention.

    E.g. let's say input dims are `[batch_size, seq_dim, input_last_dim]`.
    Scenario 1: If `output_last_dim` is not `None`, then the output dims of this
    module would be `[batch_size, seq_dim, output_last_dim]`. Note `key_dim` is
    overridden by `output_last_dim`.
    Scenario 2: If `output_last_dim` is `None` and `key_dim` is not `None`, then
    the output dims of this module would be `[batch_size, seq_dim, key_dim]`.
    Scenario 3: If the `output_last_dim` and `key_dim` are both `None`, the
    output dims would be `[batch_size, seq_dim, input_last_dim]`.

    Args:
      num_attention_heads: Number of attention heads.
      inner_dim: The output dimension of the first Dense layer in a two-layer
        feedforward network.
      inner_activation: The activation for the first Dense layer in a two-layer
        feedforward network.
      output_range: The sequence output range, `[0, output_range)`, for slicing
        the target sequence. `None` means the target sequence is not sliced.
      kernel_initializer: Initializer for dense layer kernels.
      bias_initializer: Initializer for dense layer biases.
      kernel_regularizer: Regularizer for dense layer kernels.
      bias_regularizer: Regularizer for dense layer biases.
      activity_regularizer: Regularizer for dense layer activity.
      kernel_constraint: Constraint for dense layer kernels.
      bias_constraint: Constraint for dense layer biases.
      use_bias: Whether to enable use_bias in the attention layer. If set to
        False, use_bias in the attention layer is disabled.
      norm_first: Whether to normalize inputs to the attention and intermediate
        dense layers. If set to False, the outputs of the attention and
        intermediate dense layers are normalized instead.
      norm_epsilon: Epsilon value to initialize normalization layers.
      output_dropout: Dropout probability for the post-attention and output
        dropout.
      attention_dropout: Dropout probability for within the attention layer.
      inner_dropout: Dropout probability for the first Dense layer in a
        two-layer feedforward network.
      attention_initializer: Initializer for kernels of attention layers. If set
        to `None`, attention layers use `kernel_initializer` as their kernel
        initializer.
      attention_axes: Axes over which the attention is applied. `None` means
        attention over all axes except batch, heads, and features.
      use_query_residual: Toggle to execute residual connection after attention.
      key_dim: `key_dim` for the `tf.keras.layers.MultiHeadAttention`. If
        `None`, we use the first `input_shape`'s last dim.
      value_dim: `value_dim` for the `tf.keras.layers.MultiHeadAttention`.
      output_last_dim: Final dimension of the output of this module. This also
        dictates the value for the final dimension of the
        multi-head-attention. When it's `None`, we use, in order of decreasing
        precedence, `key_dim` * `num_heads` or the first `input_shape`'s last
        dim as the output's last dim.
      diff_q_kv_att_layer_norm: If `True`, create a separate attention layer
        norm layer for query and key-value if `norm_first` is `True`. Invalid
        to set to `True` if `norm_first` is `False`.
      **kwargs: keyword arguments.
    """
        util.filter_kwargs(kwargs)
        super().__init__(**kwargs)

        # Deprecation warning.
        if output_range is not None:
            logging.warning(
                "`output_range` is avaliable as an argument for `call()`."
                "The `output_range` as __init__ argument is deprecated.")

        self._num_heads = num_attention_heads
        self._inner_dim = inner_dim
        self._inner_activation = inner_activation
        self._attention_dropout_rate = attention_dropout
        self._output_dropout_rate = output_dropout
        self._output_range = output_range
        self._kernel_initializer = tf.keras.initializers.get(
            kernel_initializer)
        self._bias_initializer = tf.keras.initializers.get(bias_initializer)
        self._kernel_regularizer = tf.keras.regularizers.get(
            kernel_regularizer)
        self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
        self._activity_regularizer = tf.keras.regularizers.get(
            activity_regularizer)
        self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
        self._bias_constraint = tf.keras.constraints.get(bias_constraint)
        self._use_bias = use_bias
        self._norm_first = norm_first
        self._norm_epsilon = norm_epsilon
        self._inner_dropout = inner_dropout
        self._use_query_residual = use_query_residual
        self._key_dim = key_dim
        self._value_dim = value_dim
        self._output_last_dim = output_last_dim
        self._diff_q_kv_att_layer_norm = diff_q_kv_att_layer_norm
        if attention_initializer:
            self._attention_initializer = tf.keras.initializers.get(
                attention_initializer)
        else:
            self._attention_initializer = tf_utils.clone_initializer(
                self._kernel_initializer)
        self._attention_axes = attention_axes

        if self._diff_q_kv_att_layer_norm and not self._norm_first:
            raise ValueError(
                "Setting `diff_q_and_kv_attention_layer_norm` to True"
                "when `norm_first` is False is invalid.")
Example 5
  def __init__(self,
               num_attention_heads,
               inner_dim,
               inner_activation,
               output_range=None,
               kernel_initializer="glorot_uniform",
               bias_initializer="zeros",
               kernel_regularizer=None,
               bias_regularizer=None,
               activity_regularizer=None,
               kernel_constraint=None,
               bias_constraint=None,
               use_bias=True,
               norm_first=False,
               norm_epsilon=1e-12,
               output_dropout=0.0,
               attention_dropout=0.0,
               inner_dropout=0.0,
               attention_initializer=None,
               attention_axes=None,
               **kwargs):
    """Initializes `TransformerEncoderBlock`.

    Args:
      num_attention_heads: Number of attention heads.
      inner_dim: The output dimension of the first Dense layer in a two-layer
        feedforward network.
      inner_activation: The activation for the first Dense layer in a two-layer
        feedforward network.
      output_range: The sequence output range, `[0, output_range)`, for slicing
        the target sequence. `None` means the target sequence is not sliced.
      kernel_initializer: Initializer for dense layer kernels.
      bias_initializer: Initializer for dense layer biases.
      kernel_regularizer: Regularizer for dense layer kernels.
      bias_regularizer: Regularizer for dense layer biases.
      activity_regularizer: Regularizer for dense layer activity.
      kernel_constraint: Constraint for dense layer kernels.
      bias_constraint: Constraint for dense layer biases.
      use_bias: Whether to enable use_bias in the attention layer. If set to
        False, use_bias in the attention layer is disabled.
      norm_first: Whether to normalize inputs to the attention and intermediate
        dense layers. If set to False, the outputs of the attention and
        intermediate dense layers are normalized instead.
      norm_epsilon: Epsilon value to initialize normalization layers.
      output_dropout: Dropout probability for the post-attention and output
        dropout.
      attention_dropout: Dropout probability for within the attention layer.
      inner_dropout: Dropout probability for the first Dense layer in a
        two-layer feedforward network.
      attention_initializer: Initializer for kernels of attention layers. If set
        to `None`, attention layers use `kernel_initializer` as their kernel
        initializer.
      attention_axes: Axes over which the attention is applied. `None` means
        attention over all axes except batch, heads, and features.
      **kwargs: keyword arguments.
    """
    util.filter_kwargs(kwargs)
    super().__init__(**kwargs)

    self._num_heads = num_attention_heads
    self._inner_dim = inner_dim
    self._inner_activation = inner_activation
    self._attention_dropout = attention_dropout
    self._attention_dropout_rate = attention_dropout
    self._output_dropout = output_dropout
    self._output_dropout_rate = output_dropout
    self._output_range = output_range
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
    self._use_bias = use_bias
    self._norm_first = norm_first
    self._norm_epsilon = norm_epsilon
    self._inner_dropout = inner_dropout
    if attention_initializer:
      self._attention_initializer = tf.keras.initializers.get(
          attention_initializer)
    else:
      self._attention_initializer = self._kernel_initializer
    self._attention_axes = attention_axes