def post_attention(
    h,
    attn_vec,
    d_model,
    n_head,
    d_head,
    dropout,
    is_training,
    kernel_initializer,
    residual=True,
):
    """Post-attention processing."""
    # post-attention projection (back to `d_model`)
    proj_o = tf.get_variable(
        'o/kernel',
        [d_model, n_head, d_head],
        dtype=h.dtype,
        initializer=kernel_initializer,
    )
    # Contract the head dimensions (n, d) of `attn_vec` against `proj_o`
    # to recover the model dimension (h).
    attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, proj_o)
    attn_out = tf.layers.dropout(attn_out, dropout, training=is_training)

    # Residual connection (optional) followed by layer normalization.
    if residual:
        output = layers.layer_norm(
            attn_out + h, begin_norm_axis=-1, scope='LayerNorm')
    else:
        output = layers.layer_norm(
            attn_out, begin_norm_axis=-1, scope='LayerNorm')

    return output

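# Illustrative call (a minimal sketch; the shapes, hyperparameter values, and
# initializer below are assumptions for the example, not taken from this file).
# `attn_vec` would come from the preceding attention computation with shape
# [qlen, bsz, n_head, d_head], and `h` is the residual input of shape
# [qlen, bsz, d_model]:
#
#   output = post_attention(
#       h, attn_vec,
#       d_model=1024, n_head=16, d_head=64,
#       dropout=0.1, is_training=True,
#       kernel_initializer=tf.initializers.random_normal(stddev=0.02))
#   # output shape: [qlen, bsz, d_model]
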
def layer_norm(input_tensor, name=None):
    """Run layer normalization on the last dimension of the tensor."""
    return layers.layer_norm(
        inputs=input_tensor,
        begin_norm_axis=-1,
        begin_params_axis=-1,
        scope=name,
    )

def positionwise_ffn(
    inp,
    d_model,
    d_inner,
    dropout,
    kernel_initializer,
    activation_type='relu',
    scope='ff',
    is_training=True,
    reuse=None,
):
    """Position-wise Feed-forward Network."""
    if activation_type == 'relu':
        activation = tf.nn.relu
    elif activation_type == 'gelu':
        activation = gelu
    else:
        raise ValueError(
            'Unsupported activation type {}'.format(activation_type))

    output = inp
    with tf.variable_scope(scope, reuse=reuse):
        # Expand to the inner dimension, then project back to `d_model`.
        output = tf.layers.dense(
            output,
            d_inner,
            activation=activation,
            kernel_initializer=kernel_initializer,
            name='layer_1',
        )
        output = tf.layers.dropout(
            output, dropout, training=is_training, name='drop_1')
        output = tf.layers.dense(
            output,
            d_model,
            kernel_initializer=kernel_initializer,
            name='layer_2',
        )
        output = tf.layers.dropout(
            output, dropout, training=is_training, name='drop_2')
        # Residual connection followed by layer normalization.
        output = layers.layer_norm(
            output + inp, begin_norm_axis=-1, scope='LayerNorm')
    return output
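
# Illustrative call (a minimal sketch; the hyperparameter values and
# initializer are assumptions for the example only). `inp` is the
# attention-block output of shape [qlen, bsz, d_model]; the FFN expands to
# `d_inner`, projects back to `d_model`, and applies the residual connection
# and layer norm:
#
#   ffn_out = positionwise_ffn(
#       inp, d_model=1024, d_inner=4096, dropout=0.1,
#       kernel_initializer=tf.initializers.random_normal(stddev=0.02),
#       activation_type='gelu', is_training=True)
#   # ffn_out shape: [qlen, bsz, d_model]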