def classifier(vocab_size=len(Vocab), embedding_dim=256, output_dim=2, mode='predict'):
    # Create the embedding layer
    embed_layer = tl.Embedding(
        vocab_size=vocab_size,    # Size of the vocabulary
        d_feature=embedding_dim)  # Embedding dimension

    # Create a mean layer, to create an "average" word embedding
    mean_layer = tl.Mean(axis=1)

    # Create a dense layer, one unit for each output class
    dense_output_layer = tl.Dense(n_units=output_dim)

    # Create the log softmax layer (no parameters needed)
    log_softmax_layer = tl.LogSoftmax()

    # Use tl.Serial to combine all layers and create the classifier
    # of type trax.layers.combinators.Serial
    model = tl.Serial(
        embed_layer,         # embedding layer
        mean_layer,          # mean layer
        dense_output_layer,  # dense output layer
        log_softmax_layer    # log softmax layer
    )

    # Return the model of type trax.layers.combinators.Serial
    return model
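# A minimal usage sketch for the classifier above (not part of the original
# source). It passes vocab_size explicitly because `Vocab` is defined elsewhere;
# the vocabulary size and token IDs below are hypothetical placeholders.
import numpy as np
import trax

sentiment_model = classifier(vocab_size=9088, embedding_dim=256, output_dim=2)

# A padded batch of integer token IDs (0 marks padding).
tokens = np.array([[12, 407, 88, 0, 0]], dtype=np.int32)

# Initialize weights from the input signature, then run a forward pass.
sentiment_model.init(trax.shapes.signature(tokens))
log_probs = sentiment_model(tokens)  # shape (1, 2): log-probabilities from LogSoftmax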
def TransformerEncoder(vocab_size=vocab_size,
                       n_classes=10,
                       d_model=512,
                       d_ff=2048,
                       n_layers=6,
                       n_heads=8,
                       dropout=0.1,
                       dropout_shared_axes=None,
                       max_len=2048,
                       mode='train',
                       ff_activation=tl.Relu,
                       EncoderBlock=EncoderBlock):
    """Returns a Transformer encoder model.

    The input to the model is a tensor of tokens.

    Args:
        vocab_size (int): vocab size. Defaults to vocab_size.
        n_classes (int): how many classes on output. Defaults to 10.
        d_model (int): depth of embedding. Defaults to 512.
        d_ff (int): depth of feed-forward layer. Defaults to 2048.
        n_layers (int): number of encoder/decoder layers. Defaults to 6.
        n_heads (int): number of attention heads. Defaults to 8.
        dropout (float): dropout rate (how much to drop out). Defaults to 0.1.
        dropout_shared_axes (int): axes on which to share dropout mask. Defaults to None.
        max_len (int): maximum symbol length for positional encoding. Defaults to 2048.
        mode (str): 'train' or 'eval'. Defaults to 'train'.
        ff_activation (function): the non-linearity in feed-forward layer. Defaults to tl.Relu.
        EncoderBlock (function): returns an encoder block. Defaults to EncoderBlock.

    Returns:
        trax.layers.combinators.Serial: A Transformer model as a layer that maps
            from a tensor of tokens to activations over a set of output classes.
    """
    positional_encoder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        tl.PositionalEncoding(max_len=max_len)
    ]

    # Repeat the encoder block n_layers times.
    encoder_blocks = [
        EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                     mode, ff_activation) for _ in range(n_layers)
    ]

    # Encoder model.
    return tl.Serial(
        tl.Branch(
            positional_encoder,
            tl.PaddingMask(),
        ),
        encoder_blocks,
        tl.Select([0], n_in=2),
        tl.LayerNorm(),
        tl.Mean(axis=1),
        tl.Dense(n_classes),
        tl.LogSoftmax(),
    )
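# The TransformerEncoder above expects an `EncoderBlock` helper defined
# elsewhere. The sketch below is one plausible implementation, modeled on
# trax's `_EncoderBlock`: a self-attention sub-block and a feed-forward
# sub-block, each wrapped in a residual connection. The original helper may
# differ in its details.
def EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                 mode, ff_activation):
    # Feed-forward sub-block: LayerNorm -> Dense(d_ff) -> activation -> Dense(d_model).
    feed_forward = [
        tl.LayerNorm(),
        tl.Dense(d_ff),
        ff_activation(),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        tl.Dense(d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
    ]
    # Self-attention sub-block; tl.Attention consumes (activations, mask) pairs.
    attention = tl.Attention(d_model, n_heads=n_heads, dropout=dropout, mode=mode)
    dropout_ = tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode)
    return [
        tl.Residual(tl.LayerNorm(), attention, dropout_),
        tl.Residual(feed_forward),
    ]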
def Siamese(vocab_size=len(vocab), d_model=128, mode='train'):
    """Returns a Siamese model.

    Args:
        vocab_size (int, optional): Length of the vocabulary. Defaults to len(vocab).
        d_model (int, optional): Depth of the model. Defaults to 128.
        mode (str, optional): 'train', 'eval' or 'predict'; predict mode is for
            fast inference. Defaults to 'train'.

    Returns:
        trax.layers.combinators.Parallel: A Siamese model.
    """

    def normalize(x):  # normalizes the vectors to have L2 norm 1
        return x / fastnp.sqrt(fastnp.sum(x * x, axis=-1, keepdims=True))

    ### START CODE HERE (Replace instances of 'None' with your code) ###
    q_processor = tl.Serial(           # Processor will run on Q1 and Q2.
        tl.Embedding(vocab_size=vocab_size, d_feature=d_model),  # Embedding layer
        tl.LSTM(n_units=d_model),      # LSTM layer
        tl.Mean(axis=1),               # Mean over columns
        tl.Fn('Normalize', normalize)  # Apply normalize function
    )  # Returns one vector of shape [batch_size, d_model].
    ### END CODE HERE ###

    # Run on Q1 and Q2 in parallel.
    model = tl.Parallel(q_processor, q_processor)
    return model
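# A minimal usage sketch for the Siamese model above (not part of the original
# assignment). It assumes `fastnp` is trax.fastmath.numpy; the vocabulary size
# and token IDs below are hypothetical placeholders.
import numpy as np
import trax
from trax.fastmath import numpy as fastnp

siamese_model = Siamese(vocab_size=41699, d_model=128)

# Two batches of padded question token IDs. Both go through the same
# q_processor object inside tl.Parallel, so the two branches share weights.
q1 = np.array([[5, 17, 931, 0]], dtype=np.int32)
q2 = np.array([[5, 17, 256, 0]], dtype=np.int32)

siamese_model.init(trax.shapes.signature((q1, q2)))
v1, v2 = siamese_model((q1, q2))  # each: shape (batch_size, d_model), L2-normalized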
def classifier(vocab_size=1, embedding_dim=256, output_dim=2, mode='train'):
    embed_layer = tl.Embedding(vocab_size=vocab_size, d_feature=embedding_dim)
    mean_layer = tl.Mean(axis=1)
    dense_output_layer = tl.Dense(n_units=output_dim)
    log_softmax_layer = tl.LogSoftmax()
    model = tl.Serial(embed_layer, mean_layer, dense_output_layer, log_softmax_layer)
    return model
def TransformerEncoder(vocab_size,
                       n_classes=10,
                       d_model=512,
                       d_ff=2048,
                       n_layers=6,
                       n_heads=8,
                       dropout=0.1,
                       dropout_shared_axes=None,
                       max_len=2048,
                       mode='train',
                       ff_activation=tl.Relu):
    """Returns a Transformer encoder model.

    The input to the model is a tensor of tokens.

    Args:
        vocab_size: int: vocab size
        n_classes: how many classes on output
        d_model: int: depth of embedding
        d_ff: int: depth of feed-forward layer
        n_layers: int: number of encoder/decoder layers
        n_heads: int: number of attention heads
        dropout: float: dropout rate (how much to drop out)
        dropout_shared_axes: axes on which to share dropout mask
        max_len: int: maximum symbol length for positional encoding
        mode: str: 'train' or 'eval'
        ff_activation: the non-linearity in feed-forward layer

    Returns:
        A Transformer model as a layer that maps from a tensor of tokens to
        activations over a set of output classes.
    """
    positional_encoder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        tl.PositionalEncoding(max_len=max_len)
    ]

    encoder_blocks = [
        _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                      mode, ff_activation) for _ in range(n_layers)
    ]

    # Assemble and return the model.
    return tl.Serial(                                     # toks
        # Encode.
        tl.Branch(positional_encoder, tl.PaddingMask()),  # vecs masks
        encoder_blocks,                                   # vecs masks
        tl.Select([0], n_in=2),                           # vecs
        tl.LayerNorm(),                                   # vecs

        # Map to output categories.
        tl.Mean(axis=1),                                  # vecs
        tl.Dense(n_classes),                              # vecs
        tl.LogSoftmax(),                                  # vecs
    )
def TransformerEncoder(vocab_size,
                       n_classes=10,
                       d_model=512,
                       d_ff=2048,
                       n_layers=6,
                       n_heads=8,
                       dropout=0.1,
                       max_len=2048,
                       mode='train',
                       ff_activation=tl.Relu):
    """Returns a Transformer encoder model.

    The input to the model is a tensor of tokens.

    Args:
        vocab_size: int: vocab size
        n_classes: how many classes on output
        d_model: int: depth of embedding
        d_ff: int: depth of feed-forward layer
        n_layers: int: number of encoder/decoder layers
        n_heads: int: number of attention heads
        dropout: float: dropout rate (how much to drop out)
        max_len: int: maximum symbol length for positional encoding
        mode: str: 'train' or 'eval'
        ff_activation: the non-linearity in feed-forward layer

    Returns:
        A Transformer model as a layer that maps from a tensor of tokens to
        activations over a set of output classes.
    """
    embedder = [
        tl.Embedding(d_model, vocab_size),
        tl.Dropout(rate=dropout, name='emb_dropout', mode=mode),
        tl.PositionalEncoding(max_len=max_len),
    ]
    return tl.Serial(                             # tokens
        tl.Dup(),                                 # toks toks
        tl.Parallel(embedder, tl.PaddingMask()),  # vecs mask
        [EncoderBlock(d_model, d_ff, n_heads, dropout, i, mode, ff_activation)
         for i in range(n_layers)],               # vecs mask
        tl.Parallel([], tl.Drop()),               # ____ 0
        tl.LayerNorm(),                           # vecs
        tl.Mean(axis=1),                          # Average on length.  # vecs
        tl.Dense(n_classes),                      # vecs
        tl.LogSoftmax(),                          # vecs
    )
def siamese(vocab_size, d_model=128):
    """Returns a Siamese model.

    Args:
        vocab_size (int): Length of the vocabulary.
        d_model (int, optional): Depth of the model. Defaults to 128.

    Returns:
        trax.layers.combinators.Parallel: A Siamese model.
    """

    def normalize(vec):  # normalizes the vectors to have L2 norm 1
        return vec / fastnp.sqrt(fastnp.sum(vec * vec, axis=-1, keepdims=True))

    s_processor = tl.Serial(
        tl.Embedding(vocab_size, d_model),  # Embedding layer
        tl.LSTM(d_model),                   # LSTM layer
        tl.Mean(axis=1),                    # Mean over columns
        tl.Fn('Normalize', normalize)       # Apply normalize function
    )  # Returns one vector of shape [batch_size, d_model].

    # Run on s1_tensor and s2_tensor in parallel.
    model = tl.Parallel(s_processor, s_processor)
    return model
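# Because each branch of the Siamese models above L2-normalizes its output,
# the cosine similarity between paired outputs reduces to a row-wise dot
# product. A short sketch; `v1` and `v2` are the two model outputs, and the
# 0.7 decision threshold is a hypothetical choice.
import numpy as np

def predict_duplicates(v1, v2, threshold=0.7):
    """Row-wise cosine similarity of L2-normalized vectors, thresholded."""
    cosine_sim = np.sum(v1 * v2, axis=-1)  # shape (batch_size,)
    return cosine_sim > threshold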
def FunnelTransformerEncoder(vocab_size,
                             n_classes=10,
                             d_model=512,
                             d_ff=2048,
                             encoder_segment_lengths=(2, 2, 2),
                             n_heads=8,
                             max_len=2048,
                             dropout=0.1,
                             dropout_shared_axes=None,
                             mode='train',
                             ff_activation=tl.Relu,
                             pool_layer=tl.AvgPool,
                             pool_size=(2,),
                             strides=(2,),
                             separate_cls=True):
    """Returns a Funnel Encoder.

    This model performs text categorization:

      - input: rank 2 tensor representing a batch of text strings via token
        IDs plus padding markers; shape is (batch_size, sequence_length). The
        tensor elements are integers in `range(vocab_size)`, and `0` values
        mark padding positions.

      - output: rank 2 tensor representing a batch of log-probability
        distributions over N categories; shape is (batch_size, `n_classes`).

    Args:
      vocab_size: Input vocabulary size -- each element of the input tensor
          should be an integer in `range(vocab_size)`. These integers typically
          represent token IDs from a vocabulary-based tokenizer.
      n_classes: Final dimension of the output tensors, representing N-way
          classification.
      d_model: Final dimension of tensors at most points in the model,
          including the initial embedding output.
      d_ff: Size of special dense layer in the feed-forward part of each
          encoder block.
      encoder_segment_lengths: Tuple, where each element denotes the number of
          transformer encoder blocks preceding a funnel transformer block.
          There is no funnel block after the last sequence of encoder blocks,
          therefore the total number of blocks in the model is equal to
          `sum(encoder_segment_lengths) + len(encoder_segment_lengths) - 1`.
      n_heads: Number of attention heads.
      max_len: Maximum symbol length for positional encoding.
      dropout: Stochastic rate (probability) for dropping an activation value
          when applying dropout within an encoder block.
      dropout_shared_axes: Tensor axes on which to share a dropout mask.
          Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`)
          is a useful way to save memory and apply consistent masks to
          activation vectors at different sequence positions.
      mode: If `'train'`, each encoder block will include dropout; else, it
          will pass all values through unaltered.
      ff_activation: Type of activation function at the end of each encoder
          block; must be an activation-type subclass of `Layer`.
      pool_layer: Type of pooling layer used for downsampling in each of the
          funnel blocks; should be `tl.AvgPool` or `tl.MaxPool`.
      pool_size: Shape of window that gets reduced to a single vector value.
          If the layer inputs are :math:`n`-dimensional arrays, then
          `pool_size` must be a tuple of length :math:`n-2`.
      strides: Offsets from the location of one window to the locations of
          neighboring windows along each axis. If specified, must be a tuple
          of the same length as `pool_size`. If None, then offsets of 1 along
          each window axis, :math:`(1, ..., 1)`, will be used.
      separate_cls: If `True`, pooling in funnel blocks is not applied to
          embeddings of the first token (`cls` from BERT paper) and only the
          final embedding of this token is used for categorization -- the rest
          are discarded. If `False`, each token from the beginning is pooled
          and all embeddings are averaged and mapped to output categories like
          in the original `TransformerEncoder` model.

    Returns:
      A Transformer model that maps strings (conveyed via token IDs) to
      probability-like activations over a range of output classes.
    """
    assert encoder_segment_lengths

    positional_encoder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        tl.PositionalEncoding(max_len=max_len)]

    encoder_blocks = []
    n_encoder_segments = len(encoder_segment_lengths)

    for i in range(n_encoder_segments):
        # Building i'th segment.
        for _ in range(encoder_segment_lengths[i]):
            # Create segment_size encoder blocks.
            encoder_blocks.append(
                _EncoderBlock(d_model, d_ff, n_heads, dropout,
                              dropout_shared_axes, mode, ff_activation))

        # If not the last segment, add a funnel block.
        if i != n_encoder_segments - 1:
            encoder_blocks.append(
                _FunnelBlock(d_model, d_ff, n_heads, dropout,
                             dropout_shared_axes, mode, ff_activation,
                             pool_layer, pool_size, strides, separate_cls))

    cls_pooling = SelectFirst() if separate_cls else tl.Mean(axis=1)

    # Assemble and return the model.
    return tl.Serial(                            # toks
        # Encode.
        tl.Branch(
            positional_encoder, tl.PaddingMask()),  # vecs masks
        encoder_blocks,                          # vecs masks
        tl.Select([0], n_in=2),                  # vecs
        tl.LayerNorm(),                          # vecs

        # Map to output categories.
        cls_pooling,                             # cls
        tl.Dense(n_classes),                     # cls
    )
def ConfigurableTransformerEncoder(vocab_size,
                                   n_classes=10,
                                   d_model=512,
                                   d_ff=2048,
                                   n_layers=6,
                                   n_heads=8,
                                   max_len=2048,
                                   dropout=0.1,
                                   dropout_shared_axes=None,
                                   mode='train',
                                   ff_activation=tl.Relu,
                                   ff_dropout=0.1,
                                   ff_chunk_size=0,
                                   ff_use_sru=0,
                                   ff_sparsity=0,
                                   ff_sparsity_type='1inN',
                                   attention_chunk_size=0,
                                   attention_type=tl.Attention,
                                   pos_type=None,
                                   pos_axial_shape=None,
                                   pos_d_axial_embs=None):
    """Returns a Transformer encoder merged with an N-way categorization head.

    This model performs text categorization:

      - input: rank 2 tensor representing a batch of text strings via token
        IDs plus padding markers; shape is (batch_size, sequence_length). The
        tensor elements are integers in `range(vocab_size)`, and `0` values
        mark padding positions.

      - output: rank 2 tensor representing a batch of log-probability
        distributions over N categories; shape is (batch_size, `n_classes`).

    Args:
      vocab_size: Input vocabulary size -- each element of the input tensor
          should be an integer in `range(vocab_size)`. These integers typically
          represent token IDs from a vocabulary-based tokenizer.
      n_classes: Final dimension of the output tensors, representing N-way
          classification.
      d_model: Final dimension of tensors at most points in the model,
          including the initial embedding output.
      d_ff: Size of special dense layer in the feed-forward part of each
          encoder block.
      n_layers: Number of encoder blocks. Each block includes attention,
          dropout, residual, feed-forward (`Dense`), and activation layers.
      n_heads: Number of attention heads.
      max_len: Maximum symbol length for positional encoding.
      dropout: Stochastic rate (probability) for dropping an activation value
          when applying dropout within an encoder block.
      dropout_shared_axes: Tensor axes on which to share a dropout mask.
          Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`)
          is a useful way to save memory and apply consistent masks to
          activation vectors at different sequence positions.
      mode: If `'train'`, each encoder block will include dropout; else, it
          will pass all values through unaltered.
      ff_activation: Type of activation function at the end of each encoder
          block; must be an activation-type subclass of `Layer`.
      ff_dropout: Stochastic rate (probability) for dropping an activation
          value when applying dropout after the FF dense layer.
      ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks.
      ff_use_sru: int or pair of ints; if > 0, we use this many SRU layers in
          addition to the feed-forward block (second int specifies SRU size).
      ff_sparsity: int; if > 0, use a sparse feed-forward block with this
          sparsity.
      ff_sparsity_type: string; if ff_sparsity > 0, use SparseFF if
          ff_sparsity_type=`'1inN'` and BlockSparseFF if
          ff_sparsity_type=`'Block'`.
      attention_chunk_size: int; if > 0, run attention chunked at this size.
      attention_type: The attention layer to use for the encoder part.
      pos_type: string, the type of positional embeddings to use.
      pos_axial_shape: tuple of ints: input shape to use for the axial
          position encoding. If unset, axial position encoding is disabled.
      pos_d_axial_embs: tuple of ints: depth of position embedding for each
          axis. Tuple length must match pos_axial_shape, and values must sum
          to d_model.

    Returns:
      A Transformer model that maps strings (conveyed via token IDs) to
      probability-like activations over a range of output classes.
    """
    positional_encoder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        PositionalEncoder(mode, dropout, max_len, pos_type, pos_axial_shape,
                          pos_d_axial_embs)
    ]

    # pylint: disable=g-complex-comprehension
    encoder_blocks = [
        EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                     mode, ff_activation, ff_dropout, ff_chunk_size,
                     ff_use_sru, ff_sparsity, ff_sparsity_type,
                     attention_chunk_size, attention_type)
        for _ in range(n_layers)
    ]
    # pylint: enable=g-complex-comprehension

    # Assemble and return the model.
    return tl.Serial(                                     # toks
        # Encode.
        tl.Branch(positional_encoder, tl.PaddingMask()),  # vecs masks
        encoder_blocks,                                   # vecs masks
        tl.Select([0], n_in=2),                           # vecs
        tl.LayerNorm(),                                   # vecs

        # Map to output categories.
        tl.Mean(axis=1),                                  # vecs
        tl.Dense(n_classes),                              # vecs
    )
def TransformerEncoder(vocab_size,
                       n_classes=10,
                       d_model=D_MODEL,
                       d_ff=D_FF,
                       n_layers=N_LAYERS,
                       n_heads=N_HEADS,
                       max_len=MAX_SEQUENCE_LENGTH,
                       dropout=DROPOUT_RATE,
                       dropout_shared_axes=DROPOUT_SHARED_AXES,
                       mode=MODE,
                       ff_activation=FF_ACTIVATION_TYPE):
    """Returns a Transformer encoder suitable for N-way classification.

    This model maps tokenized text to N-way (``n_classes``) activations:

      - input: Array representing a batch of text strings via token IDs plus
        padding markers; shape is (batch_size, sequence_length), where
        sequence_length <= ``max_len``. Array elements are integers in
        ``range(vocab_size)``, and 0 values mark padding positions.

      - output: Array representing a batch of raw (non-normalized) activations
        over ``n_classes`` categories; shape is (batch_size, ``n_classes``).

    Args:
      vocab_size: Input vocabulary size -- each element of the input array
          should be an integer in ``range(vocab_size)``. These integers
          typically represent token IDs from a vocabulary-based tokenizer.
      n_classes: Last/innermost dimension of output arrays, suitable for N-way
          classification.
      d_model: Last/innermost dimension of activation arrays at most points in
          the model, including the initial embedding output.
      d_ff: Last/innermost dimension of special (typically wider)
          :py:class:`Dense` layer in the feedforward part of each encoder
          block.
      n_layers: Number of encoder blocks. Each block includes attention,
          dropout, residual, layer-norm, feedforward (:py:class:`Dense`), and
          activation layers.
      n_heads: Number of attention heads.
      max_len: Maximum symbol length for positional encoding.
      dropout: Stochastic rate (probability) for dropping an activation value
          when applying dropout within encoder blocks. The same rate is also
          used for attention dropout in encoder blocks.
      dropout_shared_axes: Tensor axes on which to share a dropout mask.
          Sharing along batch and sequence axes (``dropout_shared_axes=(0,1)``)
          is a useful way to save memory and apply consistent masks to
          activation vectors at different sequence positions.
      mode: If ``'train'``, each encoder block will include dropout; else, it
          will pass all values through unaltered.
      ff_activation: Type of activation function at the end of each encoder
          block; must be an activation-type subclass of :py:class:`Layer`.

    Returns:
      A Transformer model that maps strings (conveyed by token IDs) to raw
      (non-normalized) activations over a range of output classes.
    """
    def _Dropout():
        return tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode)

    def _EncBlock():
        return _EncoderBlock(d_model, d_ff, n_heads, dropout,
                             dropout_shared_axes, mode, ff_activation)

    return tl.Serial(
        tl.Branch([], tl.PaddingMask()),  # Creates masks from copy of the tokens.
        tl.Embedding(vocab_size, d_model),
        _Dropout(),
        tl.PositionalEncoding(max_len=max_len),
        [_EncBlock() for _ in range(n_layers)],
        tl.Select([0], n_in=2),  # Drops the masks.
        tl.LayerNorm(),
        tl.Mean(axis=1),
        tl.Dense(n_classes),
    )
def TransformerEncoder(vocab_size,
                       n_classes=10,
                       d_model=512,
                       d_ff=2048,
                       n_layers=6,
                       n_heads=8,
                       max_len=2048,
                       dropout=0.1,
                       dropout_shared_axes=None,
                       mode='train',
                       ff_activation=tl.Relu):
    """Returns a Transformer encoder merged with an N-way categorization head.

    This model performs text categorization:

      - input: rank 2 tensor representing a batch of text strings via token
        IDs plus padding markers; shape is (batch_size, sequence_length). The
        tensor elements are integers in `range(vocab_size)`, and `0` values
        mark padding positions.

      - output: rank 2 tensor representing a batch of log-probability
        distributions over N categories; shape is (batch_size, `n_classes`).

    Args:
      vocab_size: Input vocabulary size -- each element of the input tensor
          should be an integer in `range(vocab_size)`. These integers typically
          represent token IDs from a vocabulary-based tokenizer.
      n_classes: Final dimension of the output tensors, representing N-way
          classification.
      d_model: Final dimension of tensors at most points in the model,
          including the initial embedding output.
      d_ff: Size of special dense layer in the feed-forward part of each
          encoder block.
      n_layers: Number of encoder blocks. Each block includes attention,
          dropout, residual, feed-forward (`Dense`), and activation layers.
      n_heads: Number of attention heads.
      max_len: Maximum symbol length for positional encoding.
      dropout: Stochastic rate (probability) for dropping an activation value
          when applying dropout within an encoder block.
      dropout_shared_axes: Tensor axes on which to share a dropout mask.
          Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`)
          is a useful way to save memory and apply consistent masks to
          activation vectors at different sequence positions.
      mode: If `'train'`, each encoder block will include dropout; else, it
          will pass all values through unaltered.
      ff_activation: Type of activation function at the end of each encoder
          block; must be an activation-type subclass of `Layer`.

    Returns:
      A Transformer model that maps strings (conveyed via token IDs) to
      probability-like activations over a range of output classes.
    """
    positional_encoder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        tl.PositionalEncoding(max_len=max_len)
    ]

    encoder_blocks = [
        _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                      mode, ff_activation) for _ in range(n_layers)
    ]

    # Assemble and return the model.
    return tl.Serial(                                     # toks
        # Encode.
        tl.Branch(positional_encoder, tl.PaddingMask()),  # vecs masks
        encoder_blocks,                                   # vecs masks
        tl.Select([0], n_in=2),                           # vecs
        tl.LayerNorm(),                                   # vecs

        # Map to output categories.
        tl.Mean(axis=1),                                  # vecs
        tl.Dense(n_classes),                              # vecs
    )
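# A minimal inference sketch for the TransformerEncoder above (not part of the
# original source). It assumes the `_EncoderBlock` helper from
# trax.models.transformer is available; the vocabulary size and token IDs are
# hypothetical placeholders.
import numpy as np
import trax

encoder = TransformerEncoder(vocab_size=32000, n_classes=10, d_model=512,
                             d_ff=2048, n_layers=6, n_heads=8, mode='eval')

tokens = np.array([[101, 2054, 2003, 0, 0, 0]], dtype=np.int32)  # 0 marks padding
encoder.init(trax.shapes.signature(tokens))
activations = encoder(tokens)  # shape (1, 10): per-class activations (no LogSoftmax)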
def TransformerEncoder(vocab_size,
                       n_classes=10,
                       d_model=512,
                       d_ff=2048,
                       n_layers=6,
                       n_heads=8,
                       dropout=0.1,
                       dropout_shared_axes=None,
                       max_len=2048,
                       mode='train',
                       ff_activation=tl.Relu):
    """Returns a Transformer-style encoder.

    For each item in a batch, this model performs a sequence-to-sequence
    mapping:

      - input: sequence of integers, usually token IDs from a fixed-size
        vocabulary -- integers in `range(M)`, where `M` is the vocabulary size.

      - output: same-length sequence of N-dimensional vectors, where each
        vector can be interpreted as a log-probability distribution over N
        discrete categories.

    Args:
      vocab_size: "Vocabulary size" -- input integer IDs must be in
          `range(vocab_size)`. IDs typically come from preprocessing text data
          with a vocabulary-based tokenizer.
      n_classes: Size/depth of the output vectors, intended for an N-way
          classification task.
      d_model: The basic embedding size (vector depth) of the model. This is
          the vector size used by the initial embedding layer and at many
          intermediate points in the model.
      d_ff: Vector depth (typically greater than `d_model`) used in the
          feed-forward (`Dense`) layer of each encoder block.
      n_layers: Number of encoder blocks. Each encoder block includes
          attention, dropout, residual, feed-forward (`Dense`), and activation
          layers.
      n_heads: Number of attention heads.
      dropout: Stochastic rate (probability) for dropping an activation value
          when applying dropout within an encoder block.
      dropout_shared_axes: Tensor axes on which to share a dropout mask.
      max_len: Maximum symbol length for positional encoding.
      mode: If `'train'`, each encoder block will include dropout; else, it
          will pass all values through unaltered.
      ff_activation: The activation function (layer) at the end of each
          encoder block.

    Returns:
      A Transformer model as a layer that maps from token IDs to activations
      over a set of output classes.
    """
    positional_encoder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        tl.PositionalEncoding(max_len=max_len)
    ]

    encoder_blocks = [
        _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                      mode, ff_activation) for _ in range(n_layers)
    ]

    # Assemble and return the model.
    return tl.Serial(                                     # toks
        # Encode.
        tl.Branch(positional_encoder, tl.PaddingMask()),  # vecs masks
        encoder_blocks,                                   # vecs masks
        tl.Select([0], n_in=2),                           # vecs
        tl.LayerNorm(),                                   # vecs

        # Map to output categories.
        tl.Mean(axis=1),                                  # vecs
        tl.Dense(n_classes),                              # vecs
        tl.LogSoftmax(),                                  # vecs
    )
def TransformerEncoder(vocab_size=vocab_size,
                       n_classes=10,
                       d_model=512,
                       d_ff=2048,
                       n_layers=6,
                       n_heads=8,
                       dropout=0.1,
                       dropout_shared_axes=None,
                       max_len=2048,
                       mode='train',
                       ff_activation=tl.Relu,
                       EncoderBlock=EncoderBlock):
    """Returns a Transformer encoder model.

    The input to the model is a tensor of tokens.

    Args:
        vocab_size (int): vocab size. Defaults to vocab_size.
        n_classes (int): how many classes on output. Defaults to 10.
        d_model (int): depth of embedding. Defaults to 512.
        d_ff (int): depth of feed-forward layer. Defaults to 2048.
        n_layers (int): number of encoder/decoder layers. Defaults to 6.
        n_heads (int): number of attention heads. Defaults to 8.
        dropout (float): dropout rate (how much to drop out). Defaults to 0.1.
        dropout_shared_axes (int): axes on which to share dropout mask. Defaults to None.
        max_len (int): maximum symbol length for positional encoding. Defaults to 2048.
        mode (str): 'train' or 'eval'. Defaults to 'train'.
        ff_activation (function): the non-linearity in feed-forward layer. Defaults to tl.Relu.
        EncoderBlock (function): returns the encoder block. Defaults to EncoderBlock.

    Returns:
        trax.layers.combinators.Serial: A Transformer model as a layer that maps
            from a tensor of tokens to activations over a set of output classes.
    """
    positional_encoder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        tl.PositionalEncoding(max_len=max_len)
    ]

    ### START CODE HERE (REPLACE INSTANCES OF 'None' WITH YOUR CODE) ###

    # Use the function `EncoderBlock` (implemented above) and pass in the
    # parameters over `n_layers`.
    encoder_blocks = [
        EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                     mode, ff_activation) for _ in range(n_layers)
    ]

    # Assemble and return the model.
    return tl.Serial(
        # Encode
        tl.Branch(
            # Use `positional_encoder`
            positional_encoder,
            # Use trax padding mask
            tl.PaddingMask(),
        ),
        # Use `encoder_blocks`
        encoder_blocks,
        # Use select layer
        tl.Select([0], n_in=2),
        # Use trax layer normalization
        tl.LayerNorm(),
        # Map to output categories.
        # Use trax mean. Set axis to 1.
        tl.Mean(axis=1),
        # Use trax Dense using `n_classes`
        tl.Dense(n_classes),
        # Use trax log softmax
        tl.LogSoftmax(),
    )