def NMTAttn(input_vocab_size=33300,
            target_vocab_size=33300,
            d_model=1024,
            n_encoder_layers=2,
            n_decoder_layers=2,
            n_attention_heads=4,
            attention_dropout=0.0,
            mode='train'):
    """Returns an LSTM sequence-to-sequence model with attention.

    The input to the model is a pair (input tokens, target tokens), e.g.,
    an English sentence (tokenized) and its translation into German (tokenized).

    Args:
        input_vocab_size: int: vocab size of the input
        target_vocab_size: int: vocab size of the target
        d_model: int: depth of embedding (n_units in the LSTM cell)
        n_encoder_layers: int: number of LSTM layers in the encoder
        n_decoder_layers: int: number of LSTM layers in the decoder after attention
        n_attention_heads: int: number of attention heads
        attention_dropout: float, dropout for the attention layer
        mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference

    Returns:
        An LSTM sequence-to-sequence model with attention.
    """
    # Create the input encoder that produces the encoder activations.
    input_encoder = input_encoder_fn(input_vocab_size, d_model, n_encoder_layers)

    # Create the layers for the pre-attention decoder.
    pre_attention_decoder = pre_attention_decoder_fn(mode, target_vocab_size, d_model)

    # Model
    model = tl.Serial(
        # Copy input tokens and target tokens for later use.
        tl.Select([0, 1, 0, 1]),

        # Run the input encoder on the input and the pre-attention decoder
        # on the target, in parallel.
        tl.Parallel(input_encoder, pre_attention_decoder),

        # Prepare queries, keys, values and mask for attention.
        tl.Fn('PrepareAttentionInput', prepare_attention_input, n_out=4),

        # Nest the AttentionQKV layer inside a Residual layer so its output is
        # added to the pre-attention decoder activations (the queries).
        tl.Residual(tl.AttentionQKV(d_model,
                                    n_heads=n_attention_heads,
                                    dropout=attention_dropout,
                                    mode=mode)),

        # Drop the attention mask, keeping the attention output and the target tokens.
        tl.Select([0, 2]),

        # Run the rest of the RNN decoder.
        [tl.LSTM(n_units=d_model) for _ in range(n_decoder_layers)],

        # Dense layer of target size.
        tl.Dense(target_vocab_size),

        # Log-softmax for output.
        tl.LogSoftmax()
    )

    return model
def NMTAttn(input_vocab_size=33300,
            target_vocab_size=33300,
            d_model=1024,
            n_encoder_layers=2,
            n_decoder_layers=2,
            n_attention_heads=4,
            attention_dropout=0.0,
            mode='train'):
    input_encoder = input_encoder_fn(input_vocab_size, d_model, n_encoder_layers)
    pre_attention_decoder = pre_attention_decoder_fn(mode, target_vocab_size, d_model)

    model = tl.Serial(
        tl.Select([0, 1, 0, 1]),
        tl.Parallel(input_encoder, pre_attention_decoder),
        tl.Fn('PrepareAttentionInput', prepare_attention_input, n_out=4),
        # Nest attention inside a Residual layer so its output is added to the
        # pre-attention decoder activations (i.e. the queries).
        tl.Residual(
            tl.AttentionQKV(d_model,
                            n_heads=n_attention_heads,
                            dropout=attention_dropout,
                            mode=mode)),
        # Step 6: drop the attention mask (it is not selected below).
        tl.Select([0, 2]),
        [tl.LSTM(d_model) for _ in range(n_decoder_layers)],
        tl.Dense(target_vocab_size),
        tl.LogSoftmax())
    return model
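A quick way to sanity-check the assembled architecture is to instantiate it and print the resulting combinator. This is a minimal sketch, assuming `trax.layers` is imported as `tl` and that `input_encoder_fn`, `pre_attention_decoder_fn` and `prepare_attention_input` from the accompanying snippets are in scope.

# Sanity-check sketch: build the model and inspect its layer structure.
model = NMTAttn(mode='train')
print(model)
# The printed Serial should list, in order: Select, Parallel(encoder,
# pre-attention decoder), PrepareAttentionInput, Residual(AttentionQKV),
# Select, the decoder LSTM stack, Dense, and LogSoftmax.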
def test_names(self):
  layer = tl.LSTM(3)
  self.assertEqual('LSTM_3', str(layer))
  layer = tl.GRU(5)
  self.assertEqual('GRU_5', str(layer))
  layer = tl.SRU(7)
  self.assertEqual('SRU_7', str(layer))
def Siamese(vocab_size=len(vocab), d_model=128, mode='train'):
    """Returns a Siamese model.

    Args:
        vocab_size (int, optional): Length of the vocabulary. Defaults to len(vocab).
        d_model (int, optional): Depth of the model. Defaults to 128.
        mode (str, optional): 'train', 'eval' or 'predict', predict mode is for fast inference. Defaults to 'train'.

    Returns:
        trax.layers.combinators.Parallel: A Siamese model.
    """

    def normalize(x):  # normalizes the vectors to have L2 norm 1
        return x / fastnp.sqrt(fastnp.sum(x * x, axis=-1, keepdims=True))

    ### START CODE HERE (Replace instances of 'None' with your code) ###
    q_processor = tl.Serial(  # Processor will run on Q1 and Q2.
        tl.Embedding(vocab_size=vocab_size, d_feature=d_model),  # Embedding layer
        tl.LSTM(n_units=d_model),                                # LSTM layer
        tl.Mean(axis=1),                                         # Mean over the sequence axis
        tl.Fn('Normalize', lambda x: normalize(x))               # Apply normalize function
    )  # Returns one vector of shape [batch_size, d_model].
    ### END CODE HERE ###

    # Run on Q1 and Q2 in parallel.
    model = tl.Parallel(q_processor, q_processor)
    return model
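The Siamese model takes two batches of padded token ids and returns one L2-normalized vector per question in each batch. Below is a minimal forward-pass sketch under stated assumptions: `fastnp` is `trax.fastmath.numpy` as in the notebook, the vocabulary size (500) and the token ids are made up for illustration, and id 0 marks padding.

import numpy as np
from trax import shapes

# Hypothetical toy inputs: two batches of padded question token ids,
# each of shape (batch_size, seq_len).
q1 = np.array([[5, 7, 9, 0], [3, 3, 0, 0]], dtype=np.int32)
q2 = np.array([[5, 8, 9, 0], [4, 4, 1, 0]], dtype=np.int32)

model = Siamese(vocab_size=500, d_model=128)
model.init(shapes.signature((q1, q2)))

# Each output has shape (batch_size, d_model) and unit L2 norm.
v1, v2 = model((q1, q2))
print(v1.shape, v2.shape)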
def test_names(self, backend):
  with fastmath.use_backend(backend):
    layer = tl.LSTM(3)
    self.assertEqual('LSTM_3', str(layer))
    layer = tl.GRU(5)
    self.assertEqual('GRU_5', str(layer))
    layer = tl.SRU(7)
    self.assertEqual('SRU_7', str(layer))
def NER(vocab_size=35181, d_model=50, tags=tag_map):
    '''
    Input:
        vocab_size - integer containing the size of the vocabulary
        d_model - integer describing the embedding size
    Output:
        model - a trax serial model
    '''
    model = tl.Serial(
        tl.Embedding(vocab_size, d_model),  # Embedding layer
        tl.LSTM(d_model),                   # LSTM layer
        tl.Dense(len(tags)),                # Dense layer with len(tags) units
        tl.LogSoftmax()                     # LogSoftmax layer
    )
    return model
def input_encoder_fn(input_vocab_size, d_model, n_encoder_layers):
    """Input encoder: converts the tokenized input sentence into encoder
    activations, which serve as the keys and values for attention.

    Args:
        input_vocab_size: int: vocab size of the input
        d_model: int: dimension of embedding (n_units in the LSTM cell)
        n_encoder_layers: int: number of LSTM layers in the encoder

    Returns:
        tl.Serial: The input encoder
    """
    input_encoder = tl.Serial(
        # Embedding layer converts token ids to vectors:
        # (batch, length) -> (batch, length, d_model)
        tl.Embedding(vocab_size=input_vocab_size, d_feature=d_model),

        # Feed the embeddings to a stack of n_encoder_layers LSTM layers.
        [tl.LSTM(n_units=d_model) for _ in range(n_encoder_layers)]
    )
    return input_encoder
def NER(vocab_size=35181, d_model=50, tags=tag_map):
    '''
    Input:
        vocab_size - integer containing the size of the vocabulary
        d_model - integer describing the embedding size
    Output:
        model - a trax serial model
    '''
    ### START CODE HERE (Replace instances of 'None' with your code) ###
    model = tl.Serial(
        tl.Embedding(vocab_size=vocab_size, d_feature=d_model),  # Embedding layer
        tl.LSTM(n_units=d_model),                                # LSTM layer
        tl.Dense(n_units=len(tags)),                             # Dense layer with len(tags) units
        tl.LogSoftmax()                                          # LogSoftmax layer
    )
    ### END CODE HERE ###
    return model
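A quick structural check of the NER model, as a sketch: it assumes `tag_map` (the notebook's dictionary mapping NER tag names to integer ids) and `trax.layers` as `tl` are in scope; the exact width of the Dense layer depends on how many tags that dictionary contains.

# Build with the defaults and print the architecture.
model = NER(vocab_size=35181, d_model=50)
print(model)
# Expected shape of the printed Serial (tag count depends on tag_map):
# Serial[ Embedding_35181_50, LSTM_50, Dense_<len(tags)>, LogSoftmax ]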
def pre_attention_decoder_fn(mode, target_vocab_size, d_model):
    """Pre-attention decoder: runs on the targets and creates activations
    that are used as queries in attention.

    Args:
        mode: str: 'train' or 'eval'
        target_vocab_size: int: vocab size of the target
        d_model: int: depth of embedding (n_units in the LSTM cell)

    Returns:
        tl.Serial: The pre-attention decoder
    """
    pre_attention_decoder = tl.Serial(
        # Shift right to insert the start-of-sentence token and implement
        # teacher forcing during training.
        tl.ShiftRight(mode=mode),

        # Embedding layer converts target tokens to vectors.
        tl.Embedding(vocab_size=target_vocab_size, d_feature=d_model),

        # Feed to an LSTM layer.
        tl.LSTM(n_units=d_model)
    )
    return pre_attention_decoder
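The ShiftRight step is what implements teacher forcing: at every position the decoder is fed the previous gold target token rather than its own prediction. A small illustrative sketch (the token ids are made up; `trax.layers` is assumed imported as `tl`):

import numpy as np
from trax import layers as tl

# ShiftRight prepends the start token (id 0) and drops the last position.
shift = tl.ShiftRight(mode='train')
targets = np.array([[5, 7, 9, 2]])
print(shift(targets))  # -> [[0 5 7 9]]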
def siamese(vocab_size, d_model=128):
    """Returns a Siamese model.

    Args:
        vocab_size (int): Length of the vocabulary.
        d_model (int, optional): Depth of the model. Defaults to 128.

    Returns:
        trax.layers.combinators.Parallel: A Siamese model.
    """

    def normalize(vec):  # normalizes the vectors to have L2 norm 1
        return vec / fastnp.sqrt(fastnp.sum(vec * vec, axis=-1, keepdims=True))

    s_processor = tl.Serial(
        tl.Embedding(vocab_size, d_model),  # Embedding layer
        tl.LSTM(d_model),                   # LSTM layer
        tl.Mean(axis=1),                    # Mean over the sequence axis
        tl.Fn('Normalize', normalize)       # Apply normalize function
    )  # Returns one vector of shape [batch_size, d_model].

    # Run on s1_tensor and s2_tensor in parallel.
    model = tl.Parallel(s_processor, s_processor)
    return model
def LSTMSeq2SeqAttn(input_vocab_size=256,
                    target_vocab_size=256,
                    d_model=512,
                    n_encoder_layers=2,
                    n_decoder_layers=2,
                    n_attention_heads=1,
                    attention_dropout=0.0,
                    mode='train'):
  """Returns an LSTM sequence-to-sequence model with attention.

  This model is an encoder-decoder that performs tokenized string-to-string
  ("source"-to-"target") transduction:

    - inputs (2):

        - source: rank 2 tensor representing a batch of text strings via token
          IDs plus padding markers; shape is (batch_size, sequence_length). The
          tensor elements are integers in `range(input_vocab_size)`, and `0`
          values mark padding positions.

        - target: rank 2 tensor representing a batch of text strings via token
          IDs plus padding markers; shape is (batch_size, sequence_length). The
          tensor elements are integers in `range(output_vocab_size)`, and `0`
          values mark padding positions.

    - output: rank 3 tensor representing a batch of log-probability
      distributions for each sequence position over possible token IDs;
      shape is (batch_size, sequence_length, `vocab_size`).

  An example use would be to translate (tokenized) sentences from English to
  German.

  The model works as follows:

  * Input encoder runs on the input tokens and creates activations that
    are used as both keys and values in attention.
  * Pre-attention decoder runs on the targets and creates activations that
    are used as queries in attention.
  * Attention runs on the queries, keys and values masking out input padding.
  * Decoder runs on the result, followed by a cross-entropy loss.

  Args:
    input_vocab_size: Input vocabulary size -- each element of the input tensor
        should be an integer in `range(vocab_size)`. These integers typically
        represent token IDs from a vocabulary-based tokenizer.
    target_vocab_size: Target vocabulary size.
    d_model: Final dimension of tensors at most points in the model, including
        the initial embedding output.
    n_encoder_layers: Number of LSTM layers in the encoder.
    n_decoder_layers: Number of LSTM layers in the decoder after attention.
    n_attention_heads: Number of attention heads.
    attention_dropout: Stochastic rate (probability) for dropping an activation
        value when applying dropout within an attention block.
    mode: If `'predict'`, use fast inference. If `'train'`, each attention block
        will include dropout; else, it will pass all values through unaltered.

  Returns:
    An LSTM sequence-to-sequence model as a layer that maps from a
    source-target tokenized text pair to activations over a vocab set.
  """
  input_encoder = tl.Serial(
      tl.Embedding(input_vocab_size, d_model),
      [tl.LSTM(d_model) for _ in range(n_encoder_layers)],
  )

  pre_attention_decoder = tl.Serial(
      tl.ShiftRight(mode=mode),
      tl.Embedding(target_vocab_size, d_model),
      tl.LSTM(d_model),
  )

  def PrepareAttentionInputs():
    """Layer that prepares queries, keys, values and mask for attention."""
    def F(encoder_activations, decoder_activations, input_tokens):
      keys = values = encoder_activations
      queries = decoder_activations
      # Mask is 1 where inputs are not padding (0) and 0 where they are padding.
      mask = (input_tokens != 0)
      # We need to add axes to the mask for attention heads and decoder length.
      mask = jnp.reshape(mask, (mask.shape[0], 1, 1, mask.shape[1]))
      # Broadcast so mask is [batch, 1 for heads, decoder-len, encoder-len].
      mask = mask + jnp.zeros((1, 1, decoder_activations.shape[1], 1))
      mask = mask.astype(jnp.float32)
      return queries, keys, values, mask
    return tl.Fn('PrepareAttentionInputs', F, n_out=4)

  return tl.Serial(              # in-toks, target-toks
      tl.Select([0, 1, 0, 1]),   # in-toks, target-toks, in-toks, target-toks
      tl.Parallel(input_encoder, pre_attention_decoder),
      PrepareAttentionInputs(),  # q, k, v, mask, target-toks
      tl.Residual(
          tl.AttentionQKV(d_model, n_heads=n_attention_heads,
                          dropout=attention_dropout, mode=mode)
      ),                         # decoder-vecs, mask, target-toks
      tl.Select([0, 2]),         # decoder-vecs, target-toks
      [tl.LSTM(d_model) for _ in range(n_decoder_layers)],
      tl.Dense(target_vocab_size),
      tl.LogSoftmax()
  )
def LSTMSeq2SeqAttn(input_vocab_size=256,
                    target_vocab_size=256,
                    d_model=512,
                    n_encoder_layers=2,
                    n_decoder_layers=2,
                    n_attention_heads=1,
                    attention_dropout=0.0,
                    mode='train'):
  """Returns an LSTM sequence-to-sequence model with attention.

  The input to the model is a pair (input tokens, target tokens), e.g.,
  an English sentence (tokenized) and its translation into German (tokenized).

  The model works as follows:

  * Input encoder runs on the input tokens and creates activations that
    are used as both keys and values in attention.
  * Pre-attention decoder runs on the targets and creates activations that
    are used as queries in attention.
  * Attention runs on the queries, keys and values masking out input padding.
  * Decoder runs on the result, followed by a cross-entropy loss.

  Args:
    input_vocab_size: int: vocab size of the input
    target_vocab_size: int: vocab size of the target
    d_model: int: depth of embedding (n_units in the LSTM cell)
    n_encoder_layers: int: number of LSTM layers in the encoder
    n_decoder_layers: int: number of LSTM layers in the decoder after attention
    n_attention_heads: int: number of attention heads
    attention_dropout: float, dropout for the attention layer
    mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference

  Returns:
    An LSTM sequence-to-sequence model with attention.
  """
  input_encoder = tl.Serial(
      tl.Embedding(input_vocab_size, d_model),
      [tl.LSTM(d_model) for _ in range(n_encoder_layers)],
  )

  pre_attention_decoder = tl.Serial(
      tl.ShiftRight(mode=mode),
      tl.Embedding(target_vocab_size, d_model),
      tl.LSTM(d_model),
  )

  def PrepareAttentionInputs():
    """Layer that prepares queries, keys, values and mask for attention."""
    def F(encoder_activations, decoder_activations, input_tokens):
      keys = values = encoder_activations
      queries = decoder_activations
      # Mask is 1 where inputs are not padding (0) and 0 where they are padding.
      mask = (input_tokens != 0)
      # We need to add axes to the mask for attention heads and decoder length.
      mask = jnp.reshape(mask, (mask.shape[0], 1, 1, mask.shape[1]))
      # Broadcast so mask is [batch, 1 for heads, decoder-len, encoder-len].
      mask = mask + jnp.zeros((1, 1, decoder_activations.shape[1], 1))
      return queries, keys, values, mask
    return tl.Fn('PrepareAttentionInputs', F, n_out=4)

  return tl.Serial(              # in-toks, target-toks
      tl.Select([0, 1, 0, 1]),   # in-toks, target-toks, in-toks, target-toks
      tl.Parallel(input_encoder, pre_attention_decoder),
      PrepareAttentionInputs(),  # q, k, v, mask, target-toks
      tl.Residual(
          tl.AttentionQKV(d_model, n_heads=n_attention_heads,
                          dropout=attention_dropout, mode=mode)),
      # decoder-vecs, mask, target-toks
      tl.Select([0, 2]),         # decoder-vecs, target-toks
      [tl.LSTM(d_model) for _ in range(n_decoder_layers)],
      tl.Dense(target_vocab_size),
      tl.LogSoftmax())
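A minimal smoke-test sketch for LSTMSeq2SeqAttn, assuming `trax` is installed and `jax.numpy` is available as `jnp` as in the library source; the batch sizes, sequence lengths, and all-ones token ids below are made up for illustration. The model passes the target tokens through alongside the log-probabilities (for the downstream loss), so calling it returns two outputs.

import numpy as np
from trax import shapes

model = LSTMSeq2SeqAttn(input_vocab_size=256, target_vocab_size=256, d_model=128)

source = np.ones((2, 10), dtype=np.int32)   # batch of 2 source sequences
target = np.ones((2, 8), dtype=np.int32)    # batch of 2 target sequences

model.init(shapes.signature((source, target)))
logprobs, out_targets = model((source, target))
print(logprobs.shape)   # expected: (2, 8, 256)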
def input_encoder_fn(input_vocab_size, d_model, n_encoder_layers):
    input_encoder = tl.Serial(
        tl.Embedding(input_vocab_size, d_model),
        [tl.LSTM(d_model) for _ in range(n_encoder_layers)])
    return input_encoder
def pre_attention_decoder_fn(mode, target_vocab_size, d_model):
    pre_attention_decoder = tl.Serial(
        tl.ShiftRight(mode=mode),
        tl.Embedding(target_vocab_size, d_model),
        tl.LSTM(d_model))
    return pre_attention_decoder