def __init__(self,
             num_attention_heads,
             inner_dim=768,
             inner_activation=tf_utils.get_activation("gelu"),
             dropout_rate=0.0,
             attention_dropout_rate=0.0,
             output_range=None,
             kernel_initializer="glorot_uniform",
             bias_initializer="zeros",
             kernel_regularizer=None,
             bias_regularizer=None,
             activity_regularizer=None,
             kernel_constraint=None,
             bias_constraint=None,
             use_layer_norm=False,
             share_rezero=True,
             **kwargs):
    """Records the layer's hyper-parameters on the instance.

    Four alternative keyword names are accepted through `**kwargs`. When
    present they take precedence over the matching named argument, which
    keeps the signature interchangeable with `TransformerEncoderBlock`:

      * `attention_dropout`       overrides `attention_dropout_rate`
      * `output_dropout`          overrides `dropout_rate`
      * `intermediate_size`       overrides `inner_dim`
      * `intermediate_activation` overrides `inner_activation`

    Args:
      num_attention_heads: Number of attention heads; stored as
        `_num_heads`.
      inner_dim: Stored as `_inner_dim`.
      inner_activation: Stored as `_inner_activation`.
      dropout_rate: Stored as `_dropout_rate`.
      attention_dropout_rate: Stored as `_attention_dropout_rate`.
      output_range: Stored as `_output_range`; not interpreted here.
      kernel_initializer: Resolved with `tf.keras.initializers.get`.
      bias_initializer: Resolved with `tf.keras.initializers.get`.
      kernel_regularizer: Resolved with `tf.keras.regularizers.get`.
      bias_regularizer: Resolved with `tf.keras.regularizers.get`.
      activity_regularizer: Resolved with `tf.keras.regularizers.get`.
      kernel_constraint: Resolved with `tf.keras.constraints.get`.
      bias_constraint: Resolved with `tf.keras.constraints.get`.
      use_layer_norm: Stored as `_use_layer_norm`; not interpreted here.
      share_rezero: Stored as `_share_rezero`; not interpreted here.
      **kwargs: Remaining keyword arguments, forwarded to the base class
        constructor after `util.filter_kwargs` has processed them.
    """
    # The alias keys must be popped before the kwargs reach the base class.
    attention_dropout_rate = kwargs.pop("attention_dropout",
                                        attention_dropout_rate)
    dropout_rate = kwargs.pop("output_dropout", dropout_rate)
    inner_dim = kwargs.pop("intermediate_size", inner_dim)
    inner_activation = kwargs.pop("intermediate_activation",
                                  inner_activation)

    # NOTE(review): `util.filter_kwargs` is defined elsewhere and its return
    # value is unused, so it presumably prunes `kwargs` in place -- confirm.
    util.filter_kwargs(kwargs)
    super().__init__(**kwargs)

    self._num_heads = num_attention_heads
    self._inner_dim = inner_dim
    self._inner_activation = inner_activation
    self._attention_dropout_rate = attention_dropout_rate
    self._dropout_rate = dropout_rate
    self._output_range = output_range
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    # Keep the activity regularizer: it is a named parameter, so it never
    # reaches the base constructor through `kwargs` and would otherwise be
    # silently discarded.
    self._activity_regularizer = tf.keras.regularizers.get(
        activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
    self._use_layer_norm = use_layer_norm
    self._share_rezero = share_rezero
def __init__(self,
             inner_dim=768,
             inner_activation=tf_utils.get_activation("gelu"),
             dropout=0.0,
             use_gate=True,
             apply_output_layer_norm=True,
             num_blocks=1,
             dropout_position="before_residual",
             kernel_initializer="glorot_uniform",
             bias_initializer="zeros",
             kernel_regularizer=None,
             bias_regularizer=None,
             activity_regularizer=None,
             kernel_constraint=None,
             bias_constraint=None,
             **kwargs):
    """Records the layer's hyper-parameters and validates `dropout_position`.

    Two alternative keyword names are accepted through `**kwargs` and, when
    present, take precedence over the matching named argument:
    `intermediate_size` overrides `inner_dim` and `intermediate_activation`
    overrides `inner_activation`.

    Args:
      inner_dim: Stored as `_inner_dim`.
      inner_activation: Stored as `_inner_activation`.
      dropout: Stored as `_dropout`.
      use_gate: Stored as `_use_gate`; not interpreted here.
      apply_output_layer_norm: Stored as `_apply_output_layer_norm`; not
        interpreted here.
      num_blocks: Stored as `_num_blocks`; not interpreted here.
      dropout_position: Either `"before_residual"` or `"after_residual"`.
      kernel_initializer: Resolved with `tf.keras.initializers.get`.
      bias_initializer: Resolved with `tf.keras.initializers.get`.
      kernel_regularizer: Resolved with `tf.keras.regularizers.get`.
      bias_regularizer: Resolved with `tf.keras.regularizers.get`.
      activity_regularizer: Resolved with `tf.keras.regularizers.get`.
      kernel_constraint: Resolved with `tf.keras.constraints.get`.
      bias_constraint: Resolved with `tf.keras.constraints.get`.
      **kwargs: Remaining keyword arguments, forwarded to the base class
        constructor after `util.filter_kwargs` has processed them.

    Raises:
      ValueError: If `dropout_position` is not one of the two accepted
        values.
    """
    # The alias keys must be popped before the kwargs reach the base class.
    inner_dim = kwargs.pop("intermediate_size", inner_dim)
    inner_activation = kwargs.pop("intermediate_activation",
                                  inner_activation)

    # NOTE(review): `util.filter_kwargs` is defined elsewhere and its return
    # value is unused, so it presumably prunes `kwargs` in place -- confirm.
    util.filter_kwargs(kwargs)
    super().__init__(**kwargs)

    self._inner_dim = inner_dim
    self._inner_activation = inner_activation
    self._dropout = dropout
    self._use_gate = use_gate
    self._num_blocks = num_blocks
    self._apply_output_layer_norm = apply_output_layer_norm
    self._dropout_position = dropout_position
    if self._dropout_position not in ("before_residual", "after_residual"):
        # Adjacent string literals are concatenated verbatim, so the
        # separating space has to be part of one of them. The 1-tuple keeps
        # the formatting valid even when a tuple is passed by mistake.
        raise ValueError(
            "The dropout_position should be either `before_residual` or "
            "`after_residual`, got: %s" % (self._dropout_position,))

    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._activity_regularizer = tf.keras.regularizers.get(
        activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
def __init__(self,
             num_attention_heads,
             inner_dim=768,
             inner_activation=tf_utils.get_activation("gelu"),
             attention_cls=attention.MultiHeadAttention,
             attention_cfg=None,
             feedforward_cls=None,
             feedforward_cfg=None,
             dropout_rate=0.0,
             attention_dropout_rate=0.0,
             norm_first=False,
             kernel_initializer="glorot_uniform",
             bias_initializer="zeros",
             kernel_regularizer=None,
             bias_regularizer=None,
             activity_regularizer=None,
             kernel_constraint=None,
             bias_constraint=None,
             **kwargs):
    """Records the layer's hyper-parameters on the instance.

    Two alternative keyword names are accepted through `**kwargs` and, when
    present, take precedence over the matching named argument:
    `intermediate_size` overrides `inner_dim` and `intermediate_activation`
    overrides `inner_activation`.

    Args:
      num_attention_heads: Number of attention heads; stored as
        `_num_heads`.
      inner_dim: Stored as `_inner_dim`.
      inner_activation: Stored as `_inner_activation`.
      attention_cls: Stored as `_attention_cls`; not instantiated here.
      attention_cfg: Stored as `_attention_cfg`; not interpreted here.
      feedforward_cls: Stored as `_feedforward_cls`; not instantiated here.
      feedforward_cfg: Stored as `_feedforward_cfg`; not interpreted here.
      dropout_rate: Stored as `_dropout_rate`.
      attention_dropout_rate: Stored as `_attention_dropout_rate`.
      norm_first: Stored as `_norm_first`; not interpreted here.
      kernel_initializer: Resolved with `tf.keras.initializers.get`.
      bias_initializer: Resolved with `tf.keras.initializers.get`.
      kernel_regularizer: Resolved with `tf.keras.regularizers.get`.
      bias_regularizer: Resolved with `tf.keras.regularizers.get`.
      activity_regularizer: Resolved with `tf.keras.regularizers.get`.
      kernel_constraint: Resolved with `tf.keras.constraints.get`.
      bias_constraint: Resolved with `tf.keras.constraints.get`.
      **kwargs: Remaining keyword arguments, forwarded to the base class
        constructor after `util.filter_kwargs` has processed them.
    """
    inner_dim = kwargs.pop("intermediate_size", inner_dim)
    # `inner_activation` is a named parameter and can therefore never show
    # up in `kwargs`; the alias that can is `intermediate_activation`.
    inner_activation = kwargs.pop("intermediate_activation",
                                  inner_activation)

    # NOTE(review): `util.filter_kwargs` is defined elsewhere and its return
    # value is unused, so it presumably prunes `kwargs` in place -- confirm.
    util.filter_kwargs(kwargs)
    super().__init__(**kwargs)

    self._attention_cfg = attention_cfg
    self._attention_cls = attention_cls
    self._feedforward_cls = feedforward_cls
    self._feedforward_cfg = feedforward_cfg
    self._norm_first = norm_first
    self._num_heads = num_attention_heads
    self._inner_dim = inner_dim
    self._inner_activation = inner_activation
    self._attention_dropout_rate = attention_dropout_rate
    self._dropout_rate = dropout_rate
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    # Keep the activity regularizer: it is a named parameter, so it never
    # reaches the base constructor through `kwargs` and would otherwise be
    # silently discarded.
    self._activity_regularizer = tf.keras.regularizers.get(
        activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
def __init__(self,
             num_attention_heads,
             inner_dim,
             inner_activation,
             output_range=None,
             kernel_initializer="glorot_uniform",
             bias_initializer="zeros",
             kernel_regularizer=None,
             bias_regularizer=None,
             activity_regularizer=None,
             kernel_constraint=None,
             bias_constraint=None,
             use_bias=True,
             norm_first=False,
             norm_epsilon=1e-12,
             output_dropout=0.0,
             attention_dropout=0.0,
             inner_dropout=0.0,
             attention_initializer=None,
             attention_axes=None,
             use_query_residual=True,
             key_dim=None,
             value_dim=None,
             output_last_dim=None,
             diff_q_kv_att_layer_norm=False,
             **kwargs):
    """Initializes `TransformerEncoderBlock`.

    Note: If `output_last_dim` is used and `use_query_residual` is `True`,
    the `output_last_dim`'s value must equal the first input's last
    dimension for the query residual connection to work. This is because
    the residual connection after the multi-head-attention requires their
    dimensions to match. If `use_query_residual` is `False`, the
    `output_last_dim` dictates the last dimension of the output of this
    module and the multi-head-attention.

    E.g. let's say input dims are `[batch_size, seq_dim, input_last_dim]`.
    Scenario 1: If `output_last_dim` is not `None`, then the output dims of
    this module would be `[batch_size, seq_dim, output_last_dim]`. Note
    `key_dim` is overridden by `output_last_dim`.
    Scenario 2: If `output_last_dim` is `None` and `key_dim` is not `None`,
    then the output dims of this module would be
    `[batch_size, seq_dim, key_dim]`.
    Scenario 3: If the `output_last_dim` and `key_dim` are both `None`, the
    output dims would be `[batch_size, seq_dim, input_last_dim]`.

    Args:
      num_attention_heads: Number of attention heads.
      inner_dim: The output dimension of the first Dense layer in a
        two-layer feedforward network.
      inner_activation: The activation for the first Dense layer in a
        two-layer feedforward network.
      output_range: the sequence output range, [0, output_range) for
        slicing the target sequence. `None` means the target sequence is
        not sliced. Deprecated as a constructor argument; a warning is
        logged when it is not `None`.
      kernel_initializer: Initializer for dense layer kernels.
      bias_initializer: Initializer for dense layer biases.
      kernel_regularizer: Regularizer for dense layer kernels.
      bias_regularizer: Regularizer for dense layer biases.
      activity_regularizer: Regularizer for dense layer activity.
      kernel_constraint: Constraint for dense layer kernels.
      bias_constraint: Constraint for dense layer biases.
      use_bias: Whether to enable use_bias in attention layer. If set
        False, use_bias in attention layer is disabled.
      norm_first: Whether to normalize inputs to attention and intermediate
        dense layers. If set False, output of attention and intermediate
        dense layers is normalized.
      norm_epsilon: Epsilon value to initialize normalization layers.
      output_dropout: Dropout probability for the post-attention and output
        dropout.
      attention_dropout: Dropout probability for within the attention
        layer.
      inner_dropout: Dropout probability for the first Dense layer in a
        two-layer feedforward network.
      attention_initializer: Initializer for kernels of attention layers.
        If set `None`, attention layers use a clone of kernel_initializer
        as initializer for kernel.
      attention_axes: axes over which the attention is applied. `None`
        means attention over all axes, but batch, heads, and features.
      use_query_residual: Toggle to execute residual connection after
        attention.
      key_dim: `key_dim` for the `tf.keras.layers.MultiHeadAttention`. If
        `None`, we use the first `input_shape`'s last dim.
      value_dim: `value_dim` for the `tf.keras.layers.MultiHeadAttention`.
      output_last_dim: Final dimension of the output of this module. This
        also dictates the value for the final dimension of the
        multi-head-attention. When it's `None`, we use, in order of
        decreasing precedence, `key_dim` * `num_heads` or the first
        `input_shape`'s last dim as the output's last dim.
      diff_q_kv_att_layer_norm: If `True`, create a separate attention
        layer norm layer for query and key-value if `norm_first` is `True`.
        Invalid to set to `True` if `norm_first` is `False`.
      **kwargs: keyword arguments.

    Raises:
      ValueError: If `diff_q_kv_att_layer_norm` is `True` while
        `norm_first` is `False`.
    """
    # NOTE(review): `util.filter_kwargs` is defined elsewhere and its return
    # value is unused, so it presumably prunes `kwargs` in place -- confirm.
    util.filter_kwargs(kwargs)
    super().__init__(**kwargs)

    # Deprecation warning.
    if output_range is not None:
        # Adjacent string literals are concatenated verbatim, so the space
        # between the two sentences has to be spelled out.
        logging.warning(
            "`output_range` is available as an argument for `call()`. "
            "The `output_range` as __init__ argument is deprecated.")

    self._num_heads = num_attention_heads
    self._inner_dim = inner_dim
    self._inner_activation = inner_activation
    self._attention_dropout_rate = attention_dropout
    self._output_dropout_rate = output_dropout
    self._output_range = output_range
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._activity_regularizer = tf.keras.regularizers.get(
        activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
    self._use_bias = use_bias
    self._norm_first = norm_first
    self._norm_epsilon = norm_epsilon
    self._inner_dropout = inner_dropout
    self._use_query_residual = use_query_residual
    self._key_dim = key_dim
    self._value_dim = value_dim
    self._output_last_dim = output_last_dim
    self._diff_q_kv_att_layer_norm = diff_q_kv_att_layer_norm
    if attention_initializer:
        self._attention_initializer = tf.keras.initializers.get(
            attention_initializer)
    else:
        # Fall back to a clone of the kernel initializer.
        self._attention_initializer = tf_utils.clone_initializer(
            self._kernel_initializer)
    self._attention_axes = attention_axes

    if self._diff_q_kv_att_layer_norm and not self._norm_first:
        # The message names the real parameter so callers can find it.
        raise ValueError(
            "Setting `diff_q_kv_att_layer_norm` to True "
            "when `norm_first` is False is invalid.")
def __init__(self,
             num_attention_heads,
             inner_dim,
             inner_activation,
             output_range=None,
             kernel_initializer="glorot_uniform",
             bias_initializer="zeros",
             kernel_regularizer=None,
             bias_regularizer=None,
             activity_regularizer=None,
             kernel_constraint=None,
             bias_constraint=None,
             use_bias=True,
             norm_first=False,
             norm_epsilon=1e-12,
             output_dropout=0.0,
             attention_dropout=0.0,
             inner_dropout=0.0,
             attention_initializer=None,
             attention_axes=None,
             **kwargs):
    """Sets up a `TransformerEncoderBlock` from its hyper-parameters.

    Args:
      num_attention_heads: How many attention heads to use.
      inner_dim: Output width of the first Dense layer of the two-layer
        feedforward network.
      inner_activation: Activation applied by the first Dense layer of the
        two-layer feedforward network.
      output_range: Slices the target sequence to `[0, output_range)`.
        With `None` the target sequence is left unsliced.
      kernel_initializer: Initializer used for dense layer kernels.
      bias_initializer: Initializer used for dense layer biases.
      kernel_regularizer: Regularizer applied to dense layer kernels.
      bias_regularizer: Regularizer applied to dense layer biases.
      activity_regularizer: Regularizer applied to dense layer activity.
      kernel_constraint: Constraint placed on dense layer kernels.
      bias_constraint: Constraint placed on dense layer biases.
      use_bias: Enables `use_bias` in the attention layer; `False` turns it
        off.
      norm_first: When true, the inputs to the attention and intermediate
        dense layers are normalized; when false, their outputs are.
      norm_epsilon: Epsilon used to initialize the normalization layers.
      output_dropout: Dropout probability for the post-attention and output
        dropout.
      attention_dropout: Dropout probability inside the attention layer.
      inner_dropout: Dropout probability for the first Dense layer of the
        two-layer feedforward network.
      attention_initializer: Initializer for the attention layers' kernels.
        A falsy value makes them reuse `kernel_initializer`.
      attention_axes: Axes the attention is applied over. `None` selects
        every axis except batch, heads, and features.
      **kwargs: keyword arguments.
    """
    util.filter_kwargs(kwargs)
    super().__init__(**kwargs)

    resolve_initializer = tf.keras.initializers.get
    resolve_regularizer = tf.keras.regularizers.get
    resolve_constraint = tf.keras.constraints.get

    # Core sizes and activation.
    self._num_heads = num_attention_heads
    self._inner_dim = inner_dim
    self._inner_activation = inner_activation

    # Each dropout value is kept under two attribute names.
    self._attention_dropout = self._attention_dropout_rate = attention_dropout
    self._output_dropout = self._output_dropout_rate = output_dropout
    self._output_range = output_range

    # Weight initializers, regularizers and constraints.
    self._kernel_initializer = resolve_initializer(kernel_initializer)
    self._bias_initializer = resolve_initializer(bias_initializer)
    self._kernel_regularizer = resolve_regularizer(kernel_regularizer)
    self._bias_regularizer = resolve_regularizer(bias_regularizer)
    self._activity_regularizer = resolve_regularizer(activity_regularizer)
    self._kernel_constraint = resolve_constraint(kernel_constraint)
    self._bias_constraint = resolve_constraint(bias_constraint)

    # Normalization and remaining switches.
    self._use_bias = use_bias
    self._norm_first = norm_first
    self._norm_epsilon = norm_epsilon
    self._inner_dropout = inner_dropout

    # Without an explicit attention initializer, reuse the kernel one.
    self._attention_initializer = (
        resolve_initializer(attention_initializer)
        if attention_initializer else self._kernel_initializer)
    self._attention_axes = attention_axes