def EncoderBlock(d_feature, d_feedforward, n_heads, dropout, mode):
  """Transformer encoder block.

  The input to the encoder is a pair (embedded source, mask) where the mask is
  created from the original source to prevent attending to the padding part of
  the input.

  Args:
    d_feature: int: depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a pair (activations, mask).
  """
  attention = [
      tl.LayerNorm(),
      tl.MultiHeadedAttention(
          d_feature, n_heads=n_heads, dropout=dropout, mode=mode),
      tl.Dropout(rate=dropout, mode=mode),
  ]
  feed_forward = [
      FeedForward(d_feature, d_feedforward, dropout, mode=mode),
  ]
  return [
      tl.Residual(attention),
      tl.Residual(feed_forward),
  ]

def EncoderBlock(d_model, d_ff, n_heads, dropout, layer_idx, mode):
  """Returns a layer sequence that implements a Transformer encoder block.

  The input to the layer sequence is a pair, (activations, mask), where the
  mask was created from the original source tokens to prevent attending to
  the padding part of the input.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    layer_idx: which layer are we at (for bookkeeping)
    mode: str: 'train' or 'eval'

  Returns:
    A sequence of layers that maps an (activations, mask) pair to an
    (activations, mask) pair.
  """
  attention = [
      tl.LayerNorm(),
      tl.Attention(d_model, n_heads=n_heads, dropout=dropout, mode=mode),
      tl.Dropout(rate=dropout, name='enc_attn_dropout', mode=mode),
  ]
  feed_forward = [
      FeedForward(d_model, d_ff, dropout, layer_idx=layer_idx, mode=mode),
  ]
  return [
      tl.Residual(attention),
      tl.Residual(feed_forward),
  ]

def EncoderBlock(d_feature, d_feedforward, n_heads, dropout, mode):
  """Returns a layer sequence that implements a Transformer encoder block.

  The input to the layer sequence is a pair, (activations, mask), where the
  mask was created from the original source tokens to prevent attending to
  the padding part of the input.

  Args:
    d_feature: int: depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    A sequence of layers that maps an (activations, mask) pair to an
    (activations, mask) pair.
  """
  attention = [
      tl.LayerNorm(),
      tl.MultiHeadedAttention(
          d_feature, n_heads=n_heads, dropout=dropout, mode=mode),
      tl.Dropout(rate=dropout, mode=mode),
  ]
  feed_forward = [
      FeedForward(d_feature, d_feedforward, dropout, mode=mode),
  ]
  return [
      tl.Residual(attention),
      tl.Residual(feed_forward),
  ]

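# Illustrative sketch (not part of the library code above): the mask in the
# (activations, mask) pair that these encoder blocks consume is typically a
# padding mask built from the source token ids. A minimal NumPy version,
# assuming pad id 0 and attention logits of shape [batch, heads, q_len, k_len]:
import numpy as np

def make_padding_mask(token_ids, pad_id=0):
  """True where attending is allowed, False over padding positions."""
  return (token_ids != pad_id)[:, None, None, :]  # [batch, 1, 1, key_len]

def apply_mask(attention_logits, mask):
  """Sets masked-out logits to a large negative value before softmax."""
  return np.where(mask, attention_logits, -1e9)

tokens = np.array([[5, 7, 2, 0, 0]])   # one sequence with two padding slots
logits = np.zeros((1, 1, 5, 5))        # dummy attention logits
masked = apply_mask(logits, make_padding_mask(tokens))
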
def DecoderBlock(d_feature, d_feedforward, n_heads, dropout, mode):
  """Returns a layer sequence that implements a Transformer decoder block.

  The input to the layer sequence is an activation tensor.

  Args:
    d_feature: int: depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    A sequence of layers that maps an activation tensor to an activation
    tensor.
  """
  self_attention = [
      tl.LayerNorm(),                           # vec
      tl.Dup(),                                 # vec vec
      tl.Parallel([], tl.CausalMask(axis=-2)),  # vec mask
      tl.MultiHeadedAttention(
          d_feature, n_heads=n_heads, dropout=dropout, mode=mode),
      tl.Parallel([], tl.Drop()),               # vec
      tl.Dropout(rate=dropout, mode=mode),      # vec
  ]
  feed_forward = [
      FeedForward(d_feature, d_feedforward, dropout, mode=mode),
  ]
  return [
      tl.Residual(self_attention),
      tl.Residual(feed_forward),
  ]

def DecoderBlock(d_feature, d_feedforward, n_heads, dropout, mode):
  """Transformer decoder layer.

  Args:
    d_feature: int: depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
  self_attention = [
      tl.LayerNorm(),
      tl.Branch([], tl.CausalMask(axis=-2)),  # Create mask.
      tl.MultiHeadedAttention(
          d_feature, n_heads=n_heads, dropout=dropout, mode=mode),
      tl.Select(0),  # Drop mask.
      tl.Dropout(rate=dropout, mode=mode),
  ]
  feed_forward = [
      FeedForward(d_feature, d_feedforward, dropout, mode=mode),
  ]
  return [
      tl.Residual(self_attention),
      tl.Residual(feed_forward),
  ]

def DecoderBlock(d_model, d_ff, n_heads, dropout, mode):
  """Returns a layer sequence that implements a Transformer decoder block.

  The input to the layer sequence is an activation tensor.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    A sequence of layers that maps an activation tensor to an activation
    tensor.
  """
  self_attention = [
      tl.LayerNorm(),  # vec
      tl.BasicCausalAttention(
          d_model, n_heads=n_heads, dropout=dropout, mode=mode),
      tl.Dropout(rate=dropout, mode=mode),  # vec
  ]
  feed_forward = [
      FeedForward(d_model, d_ff, dropout, mode=mode),
  ]
  return [
      tl.Residual(self_attention),
      tl.Residual(feed_forward),
  ]

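# Illustrative sketch (not part of the library code above): the causal mask
# created by the decoder blocks is lower-triangular, so position i can only
# attend to positions j <= i. A minimal NumPy version for a length-4 sequence:
import numpy as np

seq_len = 4
causal_mask = np.tril(np.ones((seq_len, seq_len), dtype=bool))
# causal_mask[i, j] is True iff j <= i, i.e. no attention to future tokens.
logits = np.zeros((seq_len, seq_len))          # dummy attention logits
masked_logits = np.where(causal_mask, logits, -1e9)
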
def WideResnetGroup(n, channels, strides=(1, 1)):
  """WideResnet group: one strided block followed by n - 1 unstrided blocks."""
  shortcut = [
      tl.Conv(channels, (3, 3), strides, padding='SAME'),
  ]
  return [
      tl.Residual(WideResnetBlock(channels, strides), shortcut=shortcut),
      tl.Residual([WideResnetBlock(channels, (1, 1)) for _ in range(n - 1)]),
  ]

def EncoderDecoder(d_feature, d_feedforward, n_heads, dropout, mode):
  """Transformer encoder-decoder layer.

  The input is a triple (decoder_input, mask, encoder) where the mask is
  created from the original source to prevent attending to the padding part
  of the encoder.

  Args:
    d_feature: int: depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (decoder_activations, mask, encoder).
  """
  decoder_self_attention = [
      # TODO(jonni): Work on combinators so that this flow is cleaner/clearer.
      tl.LayerNorm(),
      tl.Dup(),
      tl.CausalMask(axis=-2),  # Create the self-attention mask.
      tl.Swap(),  # Put mask behind the activations.
      tl.MultiHeadedAttention(
          d_feature, n_heads=n_heads, dropout=dropout, mode=mode),
      tl.Swap(),  # Put self-attention mask on top.
      tl.Drop(),  # Drop self-attention mask.
      tl.Dropout(rate=dropout, mode=mode),
  ]
  decoder_to_encoder_attention = [
      tl.Select((0, 2, 2, 1, 2)),  # (dec, enc, enc, mask, enc-copy)
      tl.MultiHeadedAttentionQKV(  # (q, k, v, mask, ...) --> (new, mask, ...)
          d_feature, n_heads=n_heads, dropout=dropout, mode=mode),
      tl.Dropout(rate=dropout, mode=mode),
  ]
  feed_forward = [
      FeedForward(d_feature, d_feedforward, dropout, mode=mode),
  ]
  return [
      tl.Residual(decoder_self_attention),
      tl.Residual(decoder_to_encoder_attention),
      tl.Residual(feed_forward),
  ]

def EncoderDecoder(d_feature, d_feedforward, n_heads, dropout, mode):
  """Transformer encoder-decoder layer.

  The input is a triple (decoder_input, mask, encoder) where the mask is
  created from the original source to prevent attending to the padding part
  of the encoder.

  Args:
    d_feature: int: depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (decoder_activations, mask, encoder).
  """
  decoder_self_attention = [                    # vecs_d          pmask vecs_e
      tl.LayerNorm(),                           # vecs_d          ..... ......
      tl.Dup(),                                 # vecs_d vecs_d   ..... ......
      tl.Parallel([], tl.CausalMask(axis=-2)),  # ______ masks    ..... ......
      tl.MultiHeadedAttention(
          d_feature, n_heads=n_heads, dropout=dropout, mode=mode),
      tl.Parallel([], tl.Drop()),               # ______ 0        ..... ......
      tl.Dropout(rate=dropout, mode=mode),      # vecs_d          ..... ......
  ]
  decoder_to_encoder_attention = [     # vecs_d        masks         vecs_e
      tl.Parallel([], [], tl.Dup()),   # ______        _____  vecs_e vecs_e
      tl.Parallel([], tl.Swap()),      # ______        vecs_e masks  ......
      tl.Parallel([], tl.Dup()),       # ______ vecs_e vecs_e .....  ......
      tl.MultiHeadedAttentionQKV(      # (q k v masks ... --> vecs_d masks ...)
          d_feature, n_heads=n_heads, dropout=dropout, mode=mode),
      tl.Dropout(rate=dropout, mode=mode),  # vecs_d mask vecs_e
  ]
  feed_forward = [
      FeedForward(d_feature, d_feedforward, dropout, mode=mode),
  ]
  return [                                        # vecs_d masks vecs_e
      tl.Residual(decoder_self_attention),        # vecs_d masks vecs_e
      tl.Residual(decoder_to_encoder_attention),  # vecs_d masks vecs_e
      tl.Residual(feed_forward),                  # vecs_d masks vecs_e
  ]

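# Illustrative sketch (not part of the library code above): in the
# decoder-to-encoder attention, queries come from the decoder stream while
# keys and values are copies of the encoder output, with the encoder padding
# mask applied. A minimal single-head NumPy version:
import numpy as np

def encdec_attention(dec, enc, enc_mask):
  """dec: [dec_len, d], enc: [enc_len, d], enc_mask: [enc_len] of bools."""
  d = dec.shape[-1]
  logits = dec @ enc.T / np.sqrt(d)                 # queries x keys
  logits = np.where(enc_mask[None, :], logits, -1e9)
  weights = np.exp(logits - logits.max(axis=-1, keepdims=True))
  weights /= weights.sum(axis=-1, keepdims=True)    # softmax over keys
  return weights @ enc                              # values = encoder output

dec = np.random.randn(3, 8)
enc = np.random.randn(5, 8)
enc_mask = np.array([True, True, True, False, False])  # last two are padding
out = encdec_attention(dec, enc, enc_mask)              # shape (3, 8)
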
def EncoderDecoderLayer(feature_depth, feedforward_depth, num_heads, dropout,
                        mode):
  """Transformer encoder-decoder layer.

  The input is a triple (decoder_input, mask, encoder) where the mask is
  created from the original source to prevent attending to the padding part
  of the encoder.

  Args:
    feature_depth: int: depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (decoder_activations, mask, encoder).
  """
  # Decoder self-attending to decoder.
  self_attention = tl.Residual(
      tl.LayerNorm(),
      tl.Dup(),
      tl.CausalMask(axis=-2),  # Create the self-attention mask.
      tl.Swap(),  # Put mask behind the activations.
      tl.MultiHeadedAttention(
          feature_depth, num_heads=num_heads, dropout=dropout, mode=mode),
      tl.Swap(),  # Put self-attention mask on top.
      tl.Drop(),  # Drop self-attention mask.
      tl.Dropout(rate=dropout, mode=mode))
  # Decoder attending to encoder.
  encoder_decoder_attention = tl.Serial(
      tl.Select((0, 2, 2, 1, 2)),  # (dec, enc, enc, mask, enc-copy)
      tl.MultiHeadedAttentionQKV(  # (q, k, v, mask, ...) --> (new, mask, ...)
          feature_depth, num_heads=num_heads, dropout=dropout, mode=mode),
      tl.Dropout(rate=dropout, mode=mode),
  )
  return tl.Serial(
      self_attention,
      tl.Residual(encoder_decoder_attention),
      ResidualFeedForward(feature_depth, feedforward_depth, dropout,
                          mode=mode))

def DecoderLayer(positions, d_feature, d_feedforward, n_heads, dropout, mode):
  """Transformer decoder layer.

  Args:
    positions: random vectors for positions
    d_feature: int: depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
  return [
      tl.Residual(  # Self-attention block.
          PreservePosition(tl.LayerNorm()),
          tl.Dup(),
          tl.Parallel(
              [],  # activation for (q, k, v)
              tl.CausalMask(axis=-2)),  # attention mask
          MultiHeadedAttentionPosition(
              positions, d_feature, n_heads=n_heads, dropout=dropout,
              mode=mode),
          PreservePosition(tl.Dropout(rate=dropout, mode=mode))),
      ResidualFeedForward(d_feature, d_feedforward, dropout, mode=mode)
  ]

def DecoderLayer(feature_depth, feedforward_depth, num_heads, dropout, mode):
  """Transformer decoder layer.

  Args:
    feature_depth: int: depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
  return layers.Serial(
      layers.Residual(  # Self-attention block.
          layers.LayerNorm(),
          layers.Branch(),
          layers.Parallel(
              layers.Identity(),  # activation for (q, k, v)
              layers.CausalMask(axis=-2)),  # attention mask
          layers.MultiHeadedAttention(
              feature_depth, num_heads=num_heads, dropout=dropout, mode=mode),
          layers.Dropout(rate=dropout, mode=mode)),
      ResidualFeedForward(feature_depth, feedforward_depth, dropout,
                          mode=mode))

def Encoder(source, source_mask):
  """Transformer encoder stack.

  Args:
    source: layer variable: raw source sequences
    source_mask: layer variable: self-attention mask

  Returns:
    Layer variable that outputs encoded source.
  """
  encoder_layer = layers.Serial(
      # input attends to self
      layers.Residual(
          layers.LayerNorm(),
          layers.Branch(size=4),
          layers.Parallel(
              layers.Identity(),  # query
              layers.Identity(),  # key
              layers.Identity(),  # value
              source_mask),       # attention mask
          multi_attention,
          layers.Dropout(dropout, mode=mode)),
      # feed-forward
      ResidualFeedForward(feature_depth, feedforward_depth, dropout,
                          mode=mode),
  )
  return layers.Serial(
      source,
      source_embedding_layer,
      layers.repeat(encoder_layer, num_layers),
      layers.LayerNorm(),
  )

def ResidualFeedForward(feature_depth, feedforward_depth, dropout, mode):
  """Residual feed-forward layer with normalization at start."""
  return layers.Residual(
      layers.LayerNorm(),
      layers.Dense(feedforward_depth),
      layers.Relu(),
      layers.Dropout(rate=dropout, mode=mode),
      layers.Dense(feature_depth),
      layers.Dropout(rate=dropout, mode=mode))

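# Illustrative sketch (not part of the library code above): ResidualFeedForward
# is a pre-norm residual block, y = x + FFN(LayerNorm(x)); dropout is omitted
# here for brevity and the weights are hypothetical. A minimal NumPy version:
import numpy as np

def layer_norm(x, eps=1e-6):
  mean = x.mean(axis=-1, keepdims=True)
  var = x.var(axis=-1, keepdims=True)
  return (x - mean) / np.sqrt(var + eps)

def residual_feed_forward(x, w1, b1, w2, b2):
  h = np.maximum(layer_norm(x) @ w1 + b1, 0.0)  # Dense + Relu
  return x + (h @ w2 + b2)                      # Dense back to d, plus skip

d, d_ff = 4, 16
x = np.random.randn(2, d)
w1, b1 = np.random.randn(d, d_ff), np.zeros(d_ff)
w2, b2 = np.random.randn(d_ff, d), np.zeros(d)
y = residual_feed_forward(x, w1, b1, w2, b2)    # same shape as x
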
def DecoderLayer(feature_depth, feedforward_depth, num_heads, dropout, mode):
  """Transformer decoder layer.

  Args:
    feature_depth: int: depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
  return tl.Serial(
      tl.Residual(  # Self-attention block.
          tl.LayerNorm(),
          tl.Branch(tl.Copy(), tl.CausalMask(axis=-2)),  # Create mask.
          tl.MultiHeadedAttention(
              feature_depth, num_heads=num_heads, dropout=dropout, mode=mode),
          tl.Select(0),  # Drop the mask.
          tl.Dropout(rate=dropout, mode=mode)),
      ResidualFeedForward(feature_depth, feedforward_depth, dropout,
                          mode=mode))

def EncoderLayer(feature_depth, feedforward_depth, num_heads, dropout, mode):
  """Transformer encoder layer.

  The input to the encoder is a pair (embedded source, mask) where the mask is
  created from the original source to prevent attending to the padding part of
  the input.

  Args:
    feature_depth: int: depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a pair (activations, mask).
  """
  return tl.Serial(
      tl.Residual(  # Attention block here.
          tl.Parallel(tl.LayerNorm(), tl.Copy()),
          tl.MultiHeadedAttention(
              feature_depth, num_heads=num_heads, dropout=dropout, mode=mode),
          tl.Parallel(tl.Dropout(rate=dropout, mode=mode), tl.Copy())),
      tl.Parallel(
          ResidualFeedForward(
              feature_depth, feedforward_depth, dropout, mode=mode),
          tl.Div(divisor=2.0)  # Mask was added to itself in the residual;
                               # divide to restore it.
      ))

def ChunkedDecoderLayer(feature_depth, feedforward_depth, num_heads, dropout,
                        chunk_selector, mode):
  """Transformer decoder layer operating on chunks.

  Args:
    feature_depth: int: depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    chunk_selector: a function from chunk number to list of chunks to attend.
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
  return tl.Serial(
      tl.Residual(  # Self-attention block.
          tl.Map(tl.LayerNorm()),
          ChunkedCausalMultiHeadedAttention(
              feature_depth, num_heads=num_heads, dropout=dropout,
              chunk_selector=chunk_selector, mode=mode),
          tl.Map(tl.Dropout(rate=dropout, mode=mode)),
      ),
      tl.Map(ResidualFeedForward(
          feature_depth, feedforward_depth, dropout, mode=mode))
  )

def Decoder(memory, target, target_mask, memory_mask):
  """Transformer decoder stack.

  Args:
    memory: layer variable: encoded source sequences
    target: layer variable: raw target sequences
    target_mask: layer variable: self-attention mask
    memory_mask: layer variable: memory attention mask

  Returns:
    Layer variable that outputs decoded target sequences.
  """
  decoder_layer = layers.Serial(
      # target attends to self
      layers.Residual(
          layers.LayerNorm(),
          layers.Branch(size=4),
          layers.Parallel(
              layers.Identity(),  # query
              layers.Identity(),  # key
              layers.Identity(),  # value
              target_mask),       # attention mask
          multi_attention,
          layers.Dropout(dropout, mode=mode)),
      # target attends to encoded source
      layers.Residual(
          layers.LayerNorm(),
          layers.Branch(size=4),
          layers.Parallel(
              layers.Identity(),  # query
              memory,             # key
              memory,             # value
              memory_mask),       # attention mask
          multi_attention,
          layers.Dropout(dropout, mode=mode)),
      # feed-forward
      ResidualFeedForward(feature_depth, feedforward_depth, dropout,
                          mode=mode))
  return layers.Serial(
      target,
      target_embedding_layer,
      layers.repeat(decoder_layer, num_layers),
      layers.LayerNorm(),
  )

def WideResnetBlock(channels, strides=(1, 1), channel_mismatch=False):
  """WideResnet convolutional block."""
  main = tl.Serial(
      tl.BatchNorm(),
      tl.Relu(),
      tl.Conv(channels, (3, 3), strides, padding='SAME'),
      tl.BatchNorm(),
      tl.Relu(),
      tl.Conv(channels, (3, 3), padding='SAME'))
  shortcut = tl.Copy() if not channel_mismatch else tl.Conv(
      channels, (3, 3), strides, padding='SAME')
  return tl.Residual(main, shortcut=shortcut)

def IdentityBlock(kernel_size, filters):
  """ResNet identical size block."""
  ks = kernel_size
  filters1, filters2, filters3 = filters
  main = tl.Serial(
      tl.Conv(filters1, (1, 1)),
      tl.BatchNorm(),
      tl.Relu(),
      tl.Conv(filters2, (ks, ks), padding='SAME'),
      tl.BatchNorm(),
      tl.Relu(),
      tl.Conv(filters3, (1, 1)),
      tl.BatchNorm())
  return tl.Serial(tl.Residual(main), tl.Relu())

def ConvBlock(kernel_size, filters, strides):
  """ResNet convolutional striding block."""
  ks = kernel_size
  filters1, filters2, filters3 = filters
  main = tl.Serial(
      tl.Conv(filters1, (1, 1), strides),
      tl.BatchNorm(),
      tl.Relu(),
      tl.Conv(filters2, (ks, ks), padding='SAME'),
      tl.BatchNorm(),
      tl.Relu(),
      tl.Conv(filters3, (1, 1)),
      tl.BatchNorm())
  shortcut = tl.Serial(
      tl.Conv(filters3, (1, 1), strides),
      tl.BatchNorm())
  return tl.Serial(tl.Residual(main, shortcut=shortcut), tl.Relu())

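# Illustrative sketch (not part of the library code above): in ConvBlock the
# main path changes both the channel count and (via strides) the spatial size,
# so the shortcut needs its own strided 1x1 convolution to produce a tensor of
# matching shape before the residual add. A toy NumPy check with linear maps
# standing in for the convolutions:
import numpy as np

x = np.random.randn(6, 16)                     # 6 positions, 16 channels
w_main = np.random.randn(16, 64)
w_shortcut = np.random.randn(16, 64)
main_out = (x @ w_main)[::2]                   # stride 2: keep every 2nd row
shortcut_out = (x @ w_shortcut)[::2]           # shortcut must stride too
y = np.maximum(main_out + shortcut_out, 0.0)   # residual add, then Relu
assert y.shape == (3, 64)
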
def EncoderDecoderLayer(feature_depth, feedforward_depth, num_heads, dropout,
                        mode):
  """Transformer encoder-decoder layer.

  The input is a triple (encoder, mask, decoder_input) where the mask is
  created from the original source to prevent attending to the padding part
  of the encoder.

  Args:
    feature_depth: int: depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (encoder, mask, decoder_activations).
  """
  # Decoder self-attending to decoder.
  self_attention = layers.Residual(
      layers.LayerNorm(),
      layers.Branch(),
      layers.Parallel(
          layers.Identity(),  # activation for (q, k, v)
          layers.CausalMask(axis=-2)),  # attention mask
      layers.MultiHeadedAttention(
          feature_depth, num_heads=num_heads, dropout=dropout, mode=mode),
      layers.Dropout(rate=dropout, mode=mode))
  # Decoder attending to encoder.
  encoder_decoder_attention = layers.Serial(
      layers.Reorder(output=((2, 0, 0), 1)),  # ((dec, enc, enc), mask)
      layers.MultiHeadedAttentionQKV(  # ((q, k, v), mask) --> new v
          feature_depth, num_heads=num_heads, dropout=dropout, mode=mode),
      layers.Dropout(rate=dropout, mode=mode),
  )
  return layers.Serial(
      layers.Parallel(layers.Identity(), layers.Identity(), self_attention),
      layers.Branch(),
      layers.Parallel(layers.Identity(), encoder_decoder_attention),
      layers.UnnestBranches(),  # (encoder, mask, old_act, new_act)
      layers.Reorder(output=(0, 1, (2, 3))),
      layers.Parallel(  # Residual after encoder-decoder attention.
          layers.Identity(), layers.Identity(), layers.SumBranches()),
      layers.Parallel(  # Feed-forward on the third component (decoder).
          layers.Identity(), layers.Identity(),
          ResidualFeedForward(
              feature_depth, feedforward_depth, dropout, mode=mode)))

def WideResnetGroup(n, channels, strides=(1, 1), bn_momentum=0.9,
                    mode='train'):
  """WideResnet group: one strided block followed by n - 1 unstrided blocks."""
  shortcut = [
      tl.Conv(channels, (3, 3), strides, padding='SAME'),
  ]
  return [
      tl.Residual(WideResnetBlock(channels, strides, bn_momentum=bn_momentum,
                                  mode=mode),
                  shortcut=shortcut),
      tl.Residual([
          WideResnetBlock(channels, (1, 1), bn_momentum=bn_momentum, mode=mode)
          for _ in range(n - 1)
      ]),
  ]

def EncoderDecoderLayer(feature_depth, feedforward_depth, num_heads, dropout,
                        mode):
  """Transformer encoder-decoder layer.

  The input is a triple (encoder, mask, decoder_input) where the mask is
  created from the original source to prevent attending to the padding part
  of the encoder.

  Args:
    feature_depth: int: depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (encoder, mask, decoder_activations).
  """
  # Decoder self-attending to decoder.
  self_attention = tl.Residual(
      tl.LayerNorm(),
      tl.Branch(tl.NoOp(), tl.CausalMask(axis=-2)),  # create mask
      tl.MultiHeadedAttention(
          feature_depth, num_heads=num_heads, dropout=dropout, mode=mode),
      tl.Select(0),  # drop mask
      tl.Dropout(rate=dropout, mode=mode))
  # Decoder attending to encoder.
  encoder_decoder_attention = tl.Serial(
      tl.Select((2, 0, 0, 1)),  # (dec, enc, enc, mask)
      tl.MultiHeadedAttentionQKV(  # (q, k, v, mask) --> new, mask
          feature_depth, num_heads=num_heads, dropout=dropout, mode=mode),
      tl.Select(0),  # drop the mask
      tl.Dropout(rate=dropout, mode=mode),
  )
  return tl.Serial(
      tl.Parallel(tl.NoOp(), tl.NoOp(), self_attention),
      tl.Branch(tl.NoOp(), encoder_decoder_attention),
      tl.Select(inputs=(('encoder', 'mask', 'old_act'), 'new_act'),
                output=('encoder', 'mask', ('old_act', 'new_act'))),
      tl.Parallel(  # Residual after encoder-decoder attention.
          tl.NoOp(), tl.NoOp(), tl.Add()),
      tl.Parallel(  # Feed-forward on the third component (decoder).
          tl.NoOp(), tl.NoOp(),
          ResidualFeedForward(
              feature_depth, feedforward_depth, dropout, mode=mode)))

def DecoderBlock(d_model, d_ff, n_heads, d_attention_key, d_attention_value,
                 attention_type, dropout, share_kv, layer_idx, mode):
  """Returns a layer sequence that implements a Transformer decoder block.

  The input to the layer sequence is an activation tensor.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    d_attention_key: int: depth of key vector for each attention head
    d_attention_value: int: depth of value vector for each attention head
    attention_type: subclass of tl.BaseCausalAttention: attention class to use
    dropout: float: dropout rate (how much to drop out)
    share_kv: bool, whether to share keys and values
    layer_idx: which layer are we at (for bookkeeping)
    mode: str: 'train' or 'eval'

  Returns:
    A sequence of layers that maps an activation tensor to an activation
    tensor.
  """
  self_attention = [
      tl.LayerNorm(),  # vec
      tl.CausalAttention(
          d_model, n_heads=n_heads, d_attention_key=d_attention_key,
          d_attention_value=d_attention_value, attention_type=attention_type,
          share_kv=share_kv, mode=mode),
      tl.Dropout(rate=dropout, name='attention_%d' % layer_idx, mode=mode),
  ]
  feed_forward = [
      FeedForward(d_model, d_ff, dropout, layer_idx=layer_idx, mode=mode),
  ]
  return [
      tl.Residual(self_attention),
      tl.Residual(feed_forward),
  ]

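# Illustrative sketch (not part of the library code above): d_attention_key
# and d_attention_value are per-head depths, so internally the attention layer
# works with n_heads * d_attention_key (and *_value) dimensions and then
# projects back to d_model, which keeps the residual add shape-compatible.
# Hypothetical sizes, only to make the arithmetic concrete:
d_model, n_heads = 512, 8
d_attention_key, d_attention_value = 32, 128  # need not be d_model // n_heads
total_key_depth = n_heads * d_attention_key       # 256
total_value_depth = n_heads * d_attention_value   # 1024
print(total_key_depth, total_value_depth)  # output is still projected to 512
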
def ResidualFeedForward(d_feature, d_feedforward, dropout, mode):
  """Residual feed-forward layer with normalization at start."""
  stack = tl.Serial(
      tl.LayerNorm(),
      tl.Dense(d_feedforward),
      tl.Relu(),
      tl.Dropout(rate=dropout, mode=mode),
      tl.Dense(d_feature),
      tl.Dropout(rate=dropout, mode=mode)
  )
  return tl.Residual(PreservePosition(stack))

def ResidualFeedForward(feature_depth, feedforward_depth, dropout, mode):
  """Residual feed-forward layer with normalization at start."""
  return layers.Residual(
      layers.LayerNorm(),
      layers.Dense(feedforward_depth,
                   kernel_initializer=layers.XavierUniformInitializer()),
      layers.Relu(),
      layers.Dropout(rate=dropout, mode=mode),
      layers.Dense(feature_depth,
                   kernel_initializer=layers.XavierUniformInitializer()),
      layers.Dropout(rate=dropout, mode=mode)
  )

def IdentityBlock(kernel_size, filters, mode='train'):
  """ResNet identical size block."""
  # TODO(jonni): Use good defaults so Resnet50 code is cleaner / less redundant.
  ks = kernel_size
  filters1, filters2, filters3 = filters
  main = [
      tl.Conv(filters1, (1, 1)),
      tl.BatchNorm(mode=mode),
      tl.Relu(),
      tl.Conv(filters2, (ks, ks), padding='SAME'),
      tl.BatchNorm(mode=mode),
      tl.Relu(),
      tl.Conv(filters3, (1, 1)),
      tl.BatchNorm(mode=mode),
  ]
  return [
      tl.Residual(main),
      tl.Relu(),
  ]

def EncoderLayer(feature_depth, feedforward_depth, num_heads, dropout, mode):
  """Transformer encoder layer.

  The input to the encoder is a pair (embedded source, mask) where the mask is
  created from the original source to prevent attending to the padding part of
  the input.

  Args:
    feature_depth: int: depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a pair (activations, mask).
  """
  # The encoder block expects (activation, mask) as input and returns only the
  # new activations; we add the mask back to the output next.
  encoder_block = layers.Serial(
      layers.Residual(  # Attention block here.
          layers.Parallel(layers.LayerNorm(), layers.Identity()),
          layers.MultiHeadedAttention(
              feature_depth, num_heads=num_heads, dropout=dropout, mode=mode),
          layers.Dropout(rate=dropout, mode=mode),
          shortcut=layers.FirstBranch()
      ),
      ResidualFeedForward(feature_depth, feedforward_depth, dropout, mode=mode)
  )
  # Now we add the mask back.
  return layers.Serial(
      layers.Reorder(output=((0, 1), 1)),  # (x, mask) --> ((x, mask), mask)
      layers.Parallel(encoder_block, layers.Identity())
  )

def ConvBlock(kernel_size, filters, strides, mode='train'):
  """ResNet convolutional striding block."""
  # TODO(jonni): Use good defaults so Resnet50 code is cleaner / less redundant.
  ks = kernel_size
  filters1, filters2, filters3 = filters
  main = [
      tl.Conv(filters1, (1, 1), strides),
      tl.BatchNorm(mode=mode),
      tl.Relu(),
      tl.Conv(filters2, (ks, ks), padding='SAME'),
      tl.BatchNorm(mode=mode),
      tl.Relu(),
      tl.Conv(filters3, (1, 1)),
      tl.BatchNorm(mode=mode),
  ]
  shortcut = [
      tl.Conv(filters3, (1, 1), strides),
      tl.BatchNorm(mode=mode),
  ]
  return [
      tl.Residual(main, shortcut=shortcut),
      tl.Relu(),
  ]

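# Illustrative sketch (not part of the library code above): ConvBlock and
# IdentityBlock return flat lists of layers, so a ResNet stage can be built by
# concatenating one strided ConvBlock with several IdentityBlocks. The helper
# name ResnetStage is hypothetical, introduced only for illustration:
def ResnetStage(kernel_size, filters, strides, n_identity, mode='train'):
  """One strided ConvBlock followed by n_identity IdentityBlocks."""
  blocks = list(ConvBlock(kernel_size, filters, strides, mode=mode))
  for _ in range(n_identity):
    blocks += IdentityBlock(kernel_size, filters, mode=mode)
  return blocks

# e.g. a stage with 3x3 kernels, bottleneck widths (64, 64, 256), stride 1:
stage = ResnetStage(3, (64, 64, 256), (1, 1), n_identity=2)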