def __init__(self, filter_size, output_size, dropout_rate, activation="relu", name="ffn"):
    """ Initializes the Transformer FFN layer.

    Args:
        filter_size: The hidden size of the intermediate (filter) layer.
        output_size: The output size.
        dropout_rate: The dropout rate.
        activation: The activation of the internal layer.
        name: The name of this layer.
    """
    super(TransformerFFN, self).__init__(name=name)
    self._dropout_rate = dropout_rate
    self._filter_size = filter_size
    self._output_size = output_size
    self._activation = activation
    self._activation_fn = get_activation(activation)
    # The two inner projections are created lazily (e.g. in build()).
    self._conv1 = None
    self._conv2 = None
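# A minimal usage sketch (not part of the original module), assuming TransformerFFN
# subclasses tf.keras.layers.Layer and using the illustrative base sizes from
# "Attention Is All You Need"; the exact call signature is an assumption here.
#
#     ffn = TransformerFFN(filter_size=2048, output_size=512,
#                          dropout_rate=0.1, activation="relu")
#     outputs = ffn(inputs)  # hypothetical call; dropout applied only in training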
def __init__(self,
             output_units,
             num_heads,
             kernel_initializer=None,
             bias_initializer="zeros",
             activation=None,
             use_bias=True,
             is_output_transform=False,
             name="transform"):
    """ Initializes MultiHeadDenseLayer.

    Args:
        output_units: An int scalar or an int list, indicating the transformed
            output units. It must be an int scalar when `is_output_transform`
            is True.
        num_heads: The number of attention heads.
        kernel_initializer: The initializer of the kernel weight.
        bias_initializer: The initializer of the bias.
        activation: A string or a callable function for activation.
        use_bias: A boolean, whether to add a bias tensor.
        is_output_transform: A boolean, whether to use this layer for the output
            transformation in multi-head attention.
        name: The name of the layer.
    """
    super(MultiHeadDenseLayer, self).__init__(name=name)
    self._output_units = output_units
    self._num_heads = num_heads
    self._kernel_initializer = kernel_initializer
    self._bias_initializer = bias_initializer
    self._use_bias = use_bias
    self._is_output_transform = is_output_transform
    self._activation = activation
    self._activation_fn = get_activation(activation)
    # Flatten `output_units` so scalar and nested (list) outputs are handled uniformly.
    self._flatten_output_units = tf.nest.flatten(self._output_units)
    if is_output_transform:
        # The output transformation must produce a single tensor.
        assert not tf.nest.is_nested(self._output_units)
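# A minimal usage sketch (not part of the original module), assuming MultiHeadDenseLayer
# subclasses tf.keras.layers.Layer and is used for the projections in multi-head
# attention; the sizes and names below are illustrative only.
#
#     # One layer producing query/key/value projections (nested output_units):
#     qkv_transform = MultiHeadDenseLayer(
#         output_units=[512, 512, 512], num_heads=8,
#         use_bias=True, is_output_transform=False, name="qkv_transform")
#     # The final output transformation requires a scalar output_units:
#     output_transform = MultiHeadDenseLayer(
#         output_units=512, num_heads=8,
#         use_bias=True, is_output_transform=True, name="output_transform")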