# All of the variants collected below build on trax combinator layers.
from trax import layers as tl


def EncoderBlock(d_model, d_ff, n_heads, dropout, layer_idx, mode):
  """Returns a layer sequence that implements a Transformer encoder block.

  The input to the layer sequence is a pair, (activations, mask), where the
  mask was created from the original source tokens to prevent attending to
  the padding part of the input.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    layer_idx: which layer are we at (for bookkeeping)
    mode: str: 'train' or 'eval'

  Returns:
    A sequence of layers that maps an (activations, mask) pair to an
    (activations, mask) pair.
  """
  attention = [
      tl.LayerNorm(),
      tl.Attention(d_model, n_heads=n_heads, dropout=dropout, mode=mode),
      tl.Dropout(rate=dropout, name='enc_attn_dropout', mode=mode),
  ]
  feed_forward = [
      FeedForward(d_model, d_ff, dropout, layer_idx=layer_idx, mode=mode),
  ]
  return tl.Serial(
      tl.Residual(attention),
      tl.Residual(feed_forward),
  )
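# None of the feed-forward builders referenced in these variants (`FeedForward`
# above, `FeedForwardBlock` / `_FeedForwardBlock` below) is included in this
# collection. The sketch here is an assumption based on the pre-norm
# feed-forward block in trax's Transformer (LayerNorm, expansion to d_ff,
# activation, dropout, projection back to d_model, dropout); the exact helpers
# these snippets were written against may differ in details such as naming.
def FeedForwardBlock(d_model, d_ff, dropout, dropout_shared_axes, mode,
                     ff_activation):
  """Sketch of a feed-forward block mapping activations to activations."""
  return [
      tl.LayerNorm(),        # pre-normalization
      tl.Dense(d_ff),        # expand to the feed-forward width
      ff_activation(),       # e.g. tl.Relu
      tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
      tl.Dense(d_model),     # project back to the model width
      tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
  ]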
def EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode,
                 ff_activation, FeedForwardBlock=FeedForwardBlock):
  """Returns a list of layers that implements a Transformer encoder block.

  The input to the layer is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model (int): depth of embedding.
    d_ff (int): depth of feed-forward layer.
    n_heads (int): number of attention heads.
    dropout (float): dropout rate (how much to drop out).
    dropout_shared_axes (int): axes on which to share dropout mask.
    mode (str): 'train' or 'eval'.
    ff_activation (function): the non-linearity in feed-forward layer.
    FeedForwardBlock (function): A function that returns the feed-forward block.

  Returns:
    list: A list of layers that maps (activations, mask) to (activations, mask).
  """
  # Attention block
  attention = tl.Attention(
      # dimension of the model
      d_feature=d_model,
      # number of attention heads
      n_heads=n_heads,
      # dropout rate
      dropout=dropout,
      # 'train' or 'eval'
      mode=mode)

  # Call `FeedForwardBlock` to build the feed-forward sublayer
  feed_forward = FeedForwardBlock(d_model, d_ff, dropout, dropout_shared_axes,
                                  mode, ff_activation)

  # Dropout block
  dropout_ = tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode)

  encoder_block = [
      # `Residual` layer around the attention sublayer
      tl.Residual(
          tl.LayerNorm(),
          attention,
          dropout_,
      ),
      # `Residual` layer around the feed-forward sublayer
      tl.Residual(feed_forward),
  ]
  return encoder_block
def _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode,
                  ff_activation):
  """Returns a list of layers that implements a Transformer encoder block.

  The input to the block is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
        the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    n_heads: Number of attention heads.
    dropout: Stochastic rate (probability) for dropping an activation value
        when applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
        along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
        way to save memory and apply consistent masks to activation vectors at
        different sequence positions.
    mode: If `'train'`, each block will include dropout; else, it will pass all
        values through unaltered.
    ff_activation: Type of activation function at the end of each block; must
        be an activation-type subclass of `Layer`.

  Returns:
    A list of layers that maps (activations, mask) to (activations, mask).
  """
  attention = tl.Attention(
      d_model, n_heads=n_heads, dropout=dropout, mode=mode)

  feed_forward = _FeedForwardBlock(
      d_model, d_ff, dropout, dropout_shared_axes, mode, ff_activation)

  dropout_ = tl.Dropout(
      rate=dropout, shared_axes=dropout_shared_axes, mode=mode)

  return [
      tl.Residual(
          tl.LayerNorm(),
          attention,
          dropout_,
      ),
      tl.Residual(feed_forward),
  ]
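# For context on where the (activations, mask) pair comes from: in trax the
# encoder blocks are usually wrapped in a Serial combinator that embeds the
# token ids and builds the padding mask in parallel branches. A minimal sketch,
# with hyperparameter values chosen here only for illustration:
def EncoderTowerSketch(vocab_size=8000, d_model=512, d_ff=2048, n_heads=8,
                       n_layers=2, dropout=0.1, mode='train'):
  """Sketch: embed tokens, derive the padding mask, apply the encoder blocks."""
  blocks = [
      _EncoderBlock(d_model, d_ff, n_heads, dropout, None, mode, tl.Relu)
      for _ in range(n_layers)
  ]
  return tl.Serial(
      # Branch 0 embeds the token ids; branch 1 turns the same ids into a
      # padding mask, so the stack carries (activations, mask) from here on.
      tl.Branch(tl.Embedding(vocab_size, d_model), tl.PaddingMask()),
      blocks,                  # each block maps (activations, mask) -> (activations, mask)
      tl.Select([0], n_in=2),  # keep the activations, drop the mask
      tl.LayerNorm(),
  )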
def _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode,
                  ff_activation):
  """Returns a list of layers that implements a Transformer encoder block.

  The input to the layer is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    dropout_shared_axes: axes on which to share dropout mask
    mode: str: 'train' or 'eval'
    ff_activation: the non-linearity in feed-forward layer

  Returns:
    A list of layers that maps (activations, mask) to (activations, mask).
  """
  attention = tl.Attention(
      d_model, n_heads=n_heads, dropout=dropout, mode=mode)

  feed_forward = _FeedForwardBlock(
      d_model, d_ff, dropout, dropout_shared_axes, mode, ff_activation)

  dropout_ = tl.Dropout(
      rate=dropout, shared_axes=dropout_shared_axes, mode=mode)

  return [
      ResidualZero(
          tl.LayerNorm(),
          attention,
          dropout_,
      ),
      ResidualZero(
          tl.LayerNorm(),
          feed_forward,
          dropout_,
      ),
  ]
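# `ResidualZero` is not a core trax layer; the name suggests a ReZero-style
# residual, y = x + alpha * F(x), where alpha is a learned scalar initialized
# to zero so each sublayer starts out as the identity. That reading is an
# assumption. A minimal numpy sketch of the arithmetic, with `f` standing in
# for the normalized attention or feed-forward sublayer:
import numpy as np

def rezero_residual(x, f, alpha=0.0):
  """ReZero-style residual: identity at alpha = 0, ordinary residual as alpha grows."""
  return x + alpha * f(x)

x = np.ones((2, 4))
assert np.allclose(rezero_residual(x, np.tanh, alpha=0.0), x)  # starts as identity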
def EncoderBlock(d_model, d_ff, n_heads, dropout, ff_activation, mode):
  """Returns a list of layers that implements a Reformer encoder block.

  The input to the layer is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    ff_activation: the non-linearity in feed-forward layer
    mode: str: 'train' or 'eval'

  Returns:
    A list of layers that maps (activations, mask) to (activations, mask).
  """
  pre_attention = tl.LayerNorm()
  attention = tl.Attention(d_model, n_heads=n_heads, dropout=dropout, mode=mode)
  post_attention = tl.Dropout(rate=dropout, name='dropout_enc_attn', mode=mode)

  # TODO(kitaev): Switch to FeedForward with BroadcastedDropout?
  feed_forward = transformer._FeedForwardBlock(  # pylint: disable=protected-access
      d_model, d_ff, dropout, -1, mode, ff_activation)
  # feed_forward = FeedForward(d_model, d_ff, dropout, ff_activation, mode)

  return [
      # TODO(kitaev): consider ReversibleAttentionHalfResidual for efficiency
      ReversibleHalfResidual([pre_attention, attention, post_attention]),
      tl.ReversibleSwap(),
      ReversibleHalfResidual(feed_forward),
      tl.ReversibleSwap(),
  ]
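# The ReversibleHalfResidual / ReversibleSwap pairing implements the reversible
# residual scheme used by the Reformer: the activations are split into two
# streams (x1, x2), each half-residual updates one stream from the other, and
# the swap alternates which stream gets updated next. Because the update is
# invertible, the inputs can be recomputed from the outputs during backprop
# instead of being stored. A minimal sketch of the arithmetic in jax.numpy,
# with `f` and `g` standing in for the attention and feed-forward sublayers
# (this shows the idea only, not the trax implementation):
import jax.numpy as jnp

def reversible_block_forward(x1, x2, f, g):
  y1 = x1 + f(x2)   # first half-residual (attention side)
  y2 = x2 + g(y1)   # second half-residual (feed-forward side), after the swap
  return y1, y2

def reversible_block_inverse(y1, y2, f, g):
  x2 = y2 - g(y1)   # undo the second half-residual
  x1 = y1 - f(x2)   # undo the first half-residual
  return x1, x2

x1, x2 = jnp.arange(4.0), jnp.ones(4)
y1, y2 = reversible_block_forward(x1, x2, jnp.tanh, jnp.sin)
r1, r2 = reversible_block_inverse(y1, y2, jnp.tanh, jnp.sin)
assert jnp.allclose(r1, x1) and jnp.allclose(r2, x2)  # inputs recovered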
# Nested helper that closes over the enclosing block's d_model, n_heads,
# dropout, and mode arguments.
def _Attention():
  return tl.Attention(d_model, n_heads=n_heads, dropout=dropout, mode=mode)
def EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode,
                 ff_activation, FeedForwardBlock=FeedForwardBlock):
  """Returns a list of layers that implements a Transformer encoder block.

  The input to the layer is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model (int): depth of embedding.
    d_ff (int): depth of feed-forward layer.
    n_heads (int): number of attention heads.
    dropout (float): dropout rate (how much to drop out).
    dropout_shared_axes (int): axes on which to share dropout mask.
    mode (str): 'train' or 'eval'.
    ff_activation (function): the non-linearity in feed-forward layer.
    FeedForwardBlock (function): A function that returns the feed-forward block.

  Returns:
    list: A list of layers that maps (activations, mask) to (activations, mask).
  """
  ### START CODE HERE (REPLACE INSTANCES OF 'None' WITH YOUR CODE) ###

  # Attention block
  attention = tl.Attention(
      # Use dimension of the model
      d_feature=d_model,
      # Set it equal to number of attention heads
      n_heads=n_heads,
      # Set it equal to `dropout`
      dropout=dropout,
      # Set it equal to `mode`
      mode=mode
  )

  # Call the function `FeedForwardBlock` (implemented before) and pass in the parameters
  feed_forward = FeedForwardBlock(
      d_model, d_ff, dropout, dropout_shared_axes, mode, ff_activation
  )

  # Dropout block
  dropout_ = tl.Dropout(
      # Set it equal to `dropout`
      rate=dropout,
      # Set it equal to the axes on which to share the dropout mask
      shared_axes=dropout_shared_axes,
      # Set it equal to `mode`
      mode=mode
  )

  encoder_block = [
      # Add `Residual` layer
      tl.Residual(
          # Add norm layer
          tl.LayerNorm(),
          # Add attention
          attention,
          # Add dropout
          dropout_,
      ),
      # Add another `Residual` layer
      tl.Residual(
          # Add feed forward
          feed_forward,
      ),
  ]

  ### END CODE HERE ###

  return encoder_block
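# A quick way to sanity-check the assembled block: wrap it in a Serial
# combinator together with an embedder and tl.PaddingMask(), initialize it from
# a batch of token ids, and confirm the output shape. The tiny hyperparameters
# and the minimal `_TinyFeedForwardBlock` below are illustrative assumptions,
# not values from the original assignment.
import numpy as np
from trax import shapes

def _TinyFeedForwardBlock(d_model, d_ff, dropout, dropout_shared_axes, mode,
                          ff_activation):
  # Minimal stand-in for the assignment's FeedForwardBlock.
  return [tl.LayerNorm(), tl.Dense(d_ff), ff_activation(),
          tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
          tl.Dense(d_model)]

d_model, d_ff, n_heads = 16, 32, 2
model = tl.Serial(
    tl.Branch(tl.Embedding(100, d_model), tl.PaddingMask()),
    EncoderBlock(d_model, d_ff, n_heads, dropout=0.1, dropout_shared_axes=None,
                 mode='eval', ff_activation=tl.Relu,
                 FeedForwardBlock=_TinyFeedForwardBlock),
    tl.Select([0], n_in=2),  # keep the activations, drop the mask
)
tokens = np.array([[5, 6, 7, 0, 0]])  # id 0 is treated as padding
model.init(shapes.signature(tokens))
print(model(tokens).shape)  # expected: (1, 5, 16)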