Example 1
import tensorflow as tf
from tensorflow.contrib import layers


def post_attention(
    h,
    attn_vec,
    d_model,
    n_head,
    d_head,
    dropout,
    is_training,
    kernel_initializer,
    residual=True,
):
    """Post-attention processing."""
    # post-attention projection (back to `d_model`)
    proj_o = tf.get_variable(
        'o/kernel',
        [d_model, n_head, d_head],
        dtype=h.dtype,
        initializer=kernel_initializer,
    )
    attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, proj_o)

    attn_out = tf.layers.dropout(attn_out, dropout, training=is_training)
    # optional residual connection, then layer normalization over the last axis
    if residual:
        attn_out = attn_out + h
    output = layers.layer_norm(attn_out,
                               begin_norm_axis=-1,
                               scope='LayerNorm')

    return output
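
A minimal usage sketch, assuming TF 1.x graph mode and the imports above; the shapes, dropout rate, and initializer are illustrative only.

qlen, bsz, d_model, n_head, d_head = 8, 2, 16, 4, 4
h = tf.zeros([qlen, bsz, d_model])                # residual stream, [qlen, bsz, d_model]
attn_vec = tf.zeros([qlen, bsz, n_head, d_head])  # per-head attention output
out = post_attention(
    h, attn_vec, d_model, n_head, d_head,
    dropout=0.1, is_training=True,
    kernel_initializer=tf.random_normal_initializer(stddev=0.02),
)
# `out` is projected back to shape [qlen, bsz, d_model]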
Example 2
def layer_norm(input_tensor, name=None):
    """Run layer normalization on the last dimension of the tensor."""
    return layers.layer_norm(
        inputs=input_tensor,
        begin_norm_axis=-1,
        begin_params_axis=-1,
        scope=name,
    )
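
A minimal usage sketch, assuming TF 1.x with `import tensorflow as tf` and `from tensorflow.contrib import layers` as in Example 1; the tensor shape is illustrative.

x = tf.zeros([8, 2, 16])             # illustrative [qlen, bsz, d_model] activations
y = layer_norm(x, name='LayerNorm')
# `y` has the same shape as `x`; normalization statistics use the last axis only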
Example 3
def positionwise_ffn(
    inp,
    d_model,
    d_inner,
    dropout,
    kernel_initializer,
    activation_type='relu',
    scope='ff',
    is_training=True,
    reuse=None,
):
    """Position-wise Feed-forward Network."""
    if activation_type == 'relu':
        activation = tf.nn.relu
    elif activation_type == 'gelu':
        activation = gelu  # `gelu` is assumed to be defined elsewhere in the module
    else:
        raise ValueError(
            'Unsupported activation type {}'.format(activation_type))

    output = inp
    with tf.variable_scope(scope, reuse=reuse):
        output = tf.layers.dense(
            output,
            d_inner,
            activation=activation,
            kernel_initializer=kernel_initializer,
            name='layer_1',
        )
        output = tf.layers.dropout(output,
                                   dropout,
                                   training=is_training,
                                   name='drop_1')
        output = tf.layers.dense(
            output,
            d_model,
            kernel_initializer=kernel_initializer,
            name='layer_2',
        )
        output = tf.layers.dropout(output,
                                   dropout,
                                   training=is_training,
                                   name='drop_2')
        output = layers.layer_norm(output + inp,
                                   begin_norm_axis=-1,
                                   scope='LayerNorm')
    return output
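
A minimal usage sketch, assuming TF 1.x graph mode and the imports from Example 1; the shapes, dropout rate, and initializer are illustrative only.

qlen, bsz, d_model, d_inner = 8, 2, 16, 64
inp = tf.zeros([qlen, bsz, d_model])  # illustrative input activations
out = positionwise_ffn(
    inp,
    d_model=d_model,
    d_inner=d_inner,
    dropout=0.1,
    kernel_initializer=tf.random_normal_initializer(stddev=0.02),
    activation_type='relu',
    is_training=True,
)
# `out` keeps the shape [qlen, bsz, d_model]; the residual connection requires
# the last dimension of `inp` to equal `d_model`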