Code example #1
def NMTAttn(input_vocab_size=33300,
            target_vocab_size=33300,
            d_model=1024,
            n_encoder_layers=2,
            n_decoder_layers=2,
            n_attention_heads=4,
            attention_dropout=0.0,
            mode='train'):
    """Returns an LSTM sequence-to-sequence model with attention.

    The input to the model is a pair (input tokens, target tokens), e.g.,
    an English sentence (tokenized) and its translation into German (tokenized).

    Args:
    input_vocab_size: int: vocab size of the input
    target_vocab_size: int: vocab size of the target
    d_model: int:  depth of embedding (n_units in the LSTM cell)
    n_encoder_layers: int: number of LSTM layers in the encoder
    n_decoder_layers: int: number of LSTM layers in the decoder after attention
    n_attention_heads: int: number of attention heads
    attention_dropout: float, dropout for the attention layer
    mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference

    Returns:
    A LSTM sequence-to-sequence model with attention.
    """

    # creation of input encoder for encoder activations
    input_encoder = input_encoder_fn(input_vocab_size, d_model, n_encoder_layers)

    # creation of layers for the pre-attention decoder
    pre_attention_decoder = pre_attention_decoder_fn(mode, target_vocab_size, d_model)

    # Model
    model = tl.Serial(

        # copy input tokens and target tokens for later use.
        tl.Select([0, 1, 0, 1]),

        # parallel run of the input encoder on the input and the pre-attention decoder on the target.
        tl.Parallel(input_encoder, pre_attention_decoder),

        # preparation of queries, keys, values and mask for attention.
        tl.Fn('PrepareAttentionInput', prepare_attention_input, n_out=4),

        # AttentionQKV layer, nested inside a Residual layer so its output is added to the pre-attention decoder activations
        tl.Residual(tl.AttentionQKV(d_model, n_heads=n_attention_heads, dropout=attention_dropout, mode=mode)),
        tl.Select([0, 2]),

        # run the rest of the RNN decoder
        [tl.LSTM(n_units=d_model) for _ in range(n_decoder_layers)],

        # Dense layer of target size
        tl.Dense(target_vocab_size),

        # Log-softmax for output
        tl.LogSoftmax()
    )

    return model
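Code examples #1 and #2 assume a helper `prepare_attention_input` that is not shown in them. Below is a minimal sketch of such a helper, mirroring the `PrepareAttentionInputs` body from the library version in code example #11; the function name matches what `tl.Fn` wraps above, while the import alias and docstring are illustrative.

from trax.fastmath import numpy as jnp

def prepare_attention_input(encoder_activations, decoder_activations, inputs):
    """Prepares queries, keys, values and the padding mask for AttentionQKV."""
    keys = values = encoder_activations
    queries = decoder_activations
    # 1 where an input token is real, 0 where it is padding (token ID 0).
    mask = (inputs != 0)
    # Add head and decoder-length axes: (batch, 1, 1, encoder_len).
    mask = jnp.reshape(mask, (mask.shape[0], 1, 1, mask.shape[1]))
    # Broadcast to (batch, 1, decoder_len, encoder_len), as AttentionQKV expects.
    mask = mask + jnp.zeros((1, 1, decoder_activations.shape[1], 1))
    return queries, keys, values, mask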
Code example #2
def NMTAttn(input_vocab_size=33300,
            target_vocab_size=33300,
            d_model=1024,
            n_encoder_layers=2,
            n_decoder_layers=2,
            n_attention_heads=4,
            attention_dropout=0.0,
            mode='train'):

    input_encoder = input_encoder_fn(input_vocab_size, d_model,
                                     n_encoder_layers)
    pre_attention_decoder = pre_attention_decoder_fn(mode, target_vocab_size,
                                                     d_model)

    model = tl.Serial(
        tl.Select([0, 1, 0, 1]),
        tl.Parallel(input_encoder, pre_attention_decoder),
        tl.Fn('PrepareAttentionInput', prepare_attention_input, n_out=4),

        # nest it inside a Residual layer to add to the pre-attention decoder activations (i.e. queries)
        tl.Residual(
            tl.AttentionQKV(d_model,
                            n_heads=n_attention_heads,
                            dropout=attention_dropout,
                            mode=mode)),

        # drop the attention mask (stack index 1), keep the activations and target tokens
        tl.Select([0, 2]),
        [tl.LSTM(d_model) for _ in range(n_decoder_layers)],
        tl.Dense(target_vocab_size),
        tl.LogSoftmax())
    return model
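A minimal usage sketch for NMTAttn, assuming Trax is installed and the helpers `input_encoder_fn`, `pre_attention_decoder_fn` (code examples #7 and #9) and `prepare_attention_input` (sketched after code example #1) are in scope; the vocabulary sizes, batch shape and token IDs below are purely illustrative.

import numpy as np
from trax import shapes

model = NMTAttn(input_vocab_size=100, target_vocab_size=100, d_model=32, mode='eval')
print(model)  # prints the nested Serial structure of the layers

# Initialize with a (source tokens, target tokens) signature.
sig = shapes.ShapeDtype((4, 16), dtype=np.int32)
model.init((sig, sig))

src = np.random.randint(1, 100, size=(4, 16), dtype=np.int32)
tgt = np.random.randint(1, 100, size=(4, 16), dtype=np.int32)
log_probs, out_targets = model((src, tgt))
# log_probs: (4, 16, 100); the target tokens are passed through unchanged so a
# downstream loss layer can consume them together with the predictions.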
Code example #3
File: rnn_test.py Project: yangliuy/trax
def test_names(self):
    layer = tl.LSTM(3)
    self.assertEqual('LSTM_3', str(layer))
    layer = tl.GRU(5)
    self.assertEqual('GRU_5', str(layer))
    layer = tl.SRU(7)
    self.assertEqual('SRU_7', str(layer))
Code example #4
def Siamese(vocab_size=len(vocab), d_model=128, mode='train'):
    """Returns a Siamese model.

    Args:
        vocab_size (int, optional): Length of the vocabulary. Defaults to len(vocab).
        d_model (int, optional): Depth of the model. Defaults to 128.
        mode (str, optional): 'train', 'eval' or 'predict', predict mode is for fast inference. Defaults to 'train'.

    Returns:
        trax.layers.combinators.Parallel: A Siamese model. 
    """
    def normalize(x):  # normalizes the vectors to have L2 norm 1
        return x / fastnp.sqrt(fastnp.sum(x * x, axis=-1, keepdims=True))

    ### START CODE HERE (Replace instances of 'None' with your code) ###
    q_processor = tl.Serial(  # Processor will run on Q1 and Q2.
        tl.Embedding(vocab_size=vocab_size,
                     d_feature=d_model),  # Embedding layer
        tl.LSTM(n_units=d_model),  # LSTM layer
        tl.Mean(axis=1),  # Mean over the sequence positions (axis 1)
        tl.Fn('Normalize', lambda x: normalize(x))  # Apply normalize function
    )  # Returns one vector of shape [batch_size, d_model].

    ### END CODE HERE ###

    # Run on Q1 and Q2 in parallel.
    model = tl.Parallel(q_processor, q_processor)
    return model
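A minimal usage sketch, assuming the notebook's `vocab` is in scope (the default argument needs it) and Trax is installed; since both output vectors are L2-normalized, their row-wise dot product is the cosine similarity of each question pair.

import numpy as np
from trax import shapes

model = Siamese()
sig = shapes.ShapeDtype((2, 10), dtype=np.int32)  # (batch, padded length)
model.init((sig, sig))

# Arbitrary token IDs; they must be smaller than vocab_size.
q1 = np.random.randint(1, 50, size=(2, 10), dtype=np.int32)
q2 = np.random.randint(1, 50, size=(2, 10), dtype=np.int32)
v1, v2 = model((q1, q2))               # each: (batch, d_model), unit length
cosine_sim = np.sum(v1 * v2, axis=-1)  # one similarity score per question pair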
Code example #5
def test_names(self, backend):
  with fastmath.use_backend(backend):
    layer = tl.LSTM(3)
    self.assertEqual('LSTM_3', str(layer))
    layer = tl.GRU(5)
    self.assertEqual('GRU_5', str(layer))
    layer = tl.SRU(7)
    self.assertEqual('SRU_7', str(layer))
Code example #6
def NER(vocab_size=35181, d_model=50, tags=tag_map):
    '''
      Input:
        vocab_size - integer containing the size of the vocabulary
        d_model - integer describing the embedding size
        tags - dictionary mapping NER tags to integer IDs; its length sizes the output layer
      Output:
        model - a trax serial model
    '''
    model = tl.Serial(
        tl.Embedding(vocab_size, d_model),  # Embedding layer
        tl.LSTM(d_model),  # LSTM layer
        tl.Dense(len(tags)),  # Dense layer with len(tags) units
        tl.LogSoftmax()  # LogSoftmax layer
    )
    return model
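A minimal usage sketch, assuming the notebook's `tag_map` is in scope (its length sizes the output layer); the model emits one log-probability vector over the tag set per token, and the shapes below are illustrative.

import numpy as np
from trax import shapes

model = NER()
model.init(shapes.ShapeDtype((1, 20), dtype=np.int32))

sentence_ids = np.random.randint(1, 1000, size=(1, 20), dtype=np.int32)
log_probs = model(sentence_ids)              # (1, 20, len(tag_map))
predictions = np.argmax(log_probs, axis=-1)  # predicted tag ID for each token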
Code example #7
def input_encoder_fn(input_vocab_size, d_model, n_encoder_layers):
    """
    Converts a tokenized sentence into encoder activations that serve as keys and values for attention.
    Args:
        input_vocab_size: int: vocab size of the input
        d_model: int: dimension of the embedding (n_units in the LSTM cell)
        n_encoder_layers: int: number of LSTM layers in the encoder
    Returns:
        tl.Serial: The input encoder
    """
    input_encoder = tl.Serial(
        # create an embedding layer to convert tokens to vectors
        tl.Embedding(vocab_size=input_vocab_size, d_feature=d_model),  # (batch, length) token IDs -> (batch, length, d_model)

        # feed the embeddings to the LSTM layers. It is a stack of n_encoder_layers LSTM layers
        [tl.LSTM(n_units=d_model) for _ in range(n_encoder_layers)]
    )
    return input_encoder
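A minimal shape-tracing sketch for the encoder, assuming Trax is installed; the sizes are illustrative and only meant to show that every input token produces one d_model-sized activation.

import numpy as np
from trax import shapes

encoder = input_encoder_fn(input_vocab_size=100, d_model=32, n_encoder_layers=2)
encoder.init(shapes.ShapeDtype((1, 7), dtype=np.int32))

activations = encoder(np.array([[3, 1, 4, 1, 5, 9, 2]]))
print(activations.shape)  # (1, 7, 32) -- these serve as keys and values in attention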
Code example #8
def NER(vocab_size=35181, d_model=50, tags=tag_map):
    '''
      Input:
        vocab_size - integer containing the size of the vocabulary
        d_model - integer describing the embedding size
        tags - dictionary mapping NER tags to integer IDs; its length sizes the output layer
      Output:
        model - a trax serial model
    '''
    ### START CODE HERE (Replace instances of 'None' with your code) ###
    model = tl.Serial(
        tl.Embedding(vocab_size=vocab_size,
                     d_feature=d_model),  # Embedding layer
        tl.LSTM(n_units=d_model),  # LSTM layer
        tl.Dense(n_units=len(tags)),  # Dense layer with len(tags) units
        tl.LogSoftmax()  # LogSoftmax layer
    )
    ### END CODE HERE ###
    return model
Code example #9
def pre_attention_decoder_fn(mode, target_vocab_size, d_model):
    """
    Pre-attention decoder runs on the targets and creates
    activations that are used as queries in attention.
    Args:
        mode: str: 'train' or 'eval'
        target_vocab_size: int: vocab size of the target
        d_model: int:  depth of embedding (n_units in the LSTM cell)
    Returns:
        tl.Serial: The pre-attention decoder
    """
    pre_attention_decoder = tl.Serial(

        # shift right to insert start-of-sentence token and implement
        # teacher forcing during training
        tl.ShiftRight(mode=mode),

        # run an embedding layer to convert tokens to vectors
        tl.Embedding(vocab_size=target_vocab_size, d_feature=d_model),

        # feed to an LSTM layer
        tl.LSTM(n_units=d_model)
    )
    return pre_attention_decoder
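The `tl.ShiftRight` layer is what implements teacher forcing here: in 'train' and 'eval' modes the decoder sees the target sequence delayed by one position, with a 0 inserted at the front as the start-of-sentence marker, while 'predict' mode passes tokens through unchanged. A minimal sketch of the effect, assuming Trax is installed:

import numpy as np
import trax.layers as tl

shift = tl.ShiftRight(mode='train')
tokens = np.array([[5, 6, 7, 8]])
print(shift(tokens))  # [[0 5 6 7]] -- a 0 is prepended and the last token is dropped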
Code example #10
def siamese(vocab_size, d_model=128):
    """Returns a Siamese model.

    Args:
        vocab_size (int): Length of the vocabulary.
        d_model (int, optional): Depth of the model. Defaults to 128.

    Returns:
        trax.layers.combinators.Parallel: A Siamese model.
    """
    def normalize(vec):  # normalizes the vectors to have L2 norm 1
        return vec / fastnp.sqrt(fastnp.sum(vec * vec, axis=-1, keepdims=True))

    s_processor = tl.Serial(
        tl.Embedding(vocab_size, d_model),  # Embedding layer
        tl.LSTM(d_model),  # LSTM layer
        tl.Mean(axis=1),  # Mean over the sequence positions (axis 1)
        tl.Fn('Normalize', normalize)  # Apply normalize function
    )  # Returns one vector of shape [batch_size, d_model].

    # Run on s1_tensor and s2_tensor in parallel.
    model = tl.Parallel(s_processor, s_processor)
    return model
Code example #11
File: rnn.py Project: yaoshuyin/trax
def LSTMSeq2SeqAttn(input_vocab_size=256,
                    target_vocab_size=256,
                    d_model=512,
                    n_encoder_layers=2,
                    n_decoder_layers=2,
                    n_attention_heads=1,
                    attention_dropout=0.0,
                    mode='train'):
  """Returns an LSTM sequence-to-sequence model with attention.

  This model is an encoder-decoder that performs tokenized string-to-string
  ("source"-to-"target") transduction:

    - inputs (2):

        - source: rank 2 tensor representing a batch of text strings via token
          IDs plus padding markers; shape is (batch_size, sequence_length). The
          tensor elements are integers in `range(input_vocab_size)`, and `0`
          values mark padding positions.

        - target: rank 2 tensor representing a batch of text strings via token
          IDs plus padding markers; shape is (batch_size, sequence_length). The
          tensor elements are integers in `range(output_vocab_size)`, and `0`
          values mark padding positions.

    - output: rank 3 tensor representing a batch of log-probability
      distributions for each sequence position over possible token IDs;
      shape is (batch_size, sequence_length, `vocab_size`).

  An example use would be to translate (tokenized) sentences from English to
  German.

  The model works as follows:

  * Input encoder runs on the input tokens and creates activations that
    are used as both keys and values in attention.
  * Pre-attention decoder runs on the targets and creates
    activations that are used as queries in attention.
  * Attention runs on the queries, keys and values masking out input padding.
  * Decoder runs on the result, followed by a cross-entropy loss.

  Args:
    input_vocab_size: Input vocabulary size -- each element of the input tensor
        should be an integer in `range(vocab_size)`. These integers typically
        represent token IDs from a vocabulary-based tokenizer.
    target_vocab_size: Target vocabulary size.
    d_model: Final dimension of tensors at most points in the model, including
        the initial embedding output.
    n_encoder_layers: Number of LSTM layers in the encoder.
    n_decoder_layers: Number of LSTM layers in the decoder after attention.
    n_attention_heads: Number of attention heads.
    attention_dropout: Stochastic rate (probability) for dropping an activation
        value when applying dropout within an attention block.
    mode: If `'predict'`, use fast inference. If `'train'`, each attention block
        will include dropout; else, it will pass all values through unaltered.

  Returns:
    An LSTM sequence-to-sequence model as a layer that maps from a
    source-target tokenized text pair to activations over a vocab set.
  """
  input_encoder = tl.Serial(
      tl.Embedding(input_vocab_size, d_model),
      [tl.LSTM(d_model) for _ in range(n_encoder_layers)],
  )

  pre_attention_decoder = tl.Serial(
      tl.ShiftRight(mode=mode),
      tl.Embedding(target_vocab_size, d_model),
      tl.LSTM(d_model),
  )

  def PrepareAttentionInputs():
    """Layer that prepares queries, keys, values and mask for attention."""
    def F(encoder_activations, decoder_activations, input_tokens):
      keys = values = encoder_activations
      queries = decoder_activations
      # Mask is 1 where inputs are not padding (0) and 0 where they are padding.
      mask = (input_tokens != 0)
      # We need to add axes to the mask for attention heads and decoder length.
      mask = jnp.reshape(mask, (mask.shape[0], 1, 1, mask.shape[1]))
      # Broadcast so mask is [batch, 1 for heads, decoder-len, encoder-len].
      mask = mask + jnp.zeros((1, 1, decoder_activations.shape[1], 1))
      mask = mask.astype(jnp.float32)
      return queries, keys, values, mask
    return tl.Fn('PrepareAttentionInputs', F, n_out=4)

  return tl.Serial(              # in-toks, target-toks
      tl.Select([0, 1, 0, 1]),   # in-toks, target-toks, in-toks, target-toks
      tl.Parallel(input_encoder, pre_attention_decoder),
      PrepareAttentionInputs(),  # q, k, v, mask, target-toks
      tl.Residual(
          tl.AttentionQKV(d_model, n_heads=n_attention_heads,
                          dropout=attention_dropout, mode=mode)
      ),                         # decoder-vecs, mask, target-toks
      tl.Select([0, 2]),         # decoder-vecs, target-toks
      [tl.LSTM(d_model) for _ in range(n_decoder_layers)],
      tl.Dense(target_vocab_size),
      tl.LogSoftmax()
  )
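Since this version ships with Trax itself, it can be instantiated directly from `trax.models`. Below is a minimal sketch of one forward pass with illustrative shapes; as the `target-toks` stack comments above show, the target tokens remain on the stack, so the model returns the log-probabilities together with them for a downstream loss layer.

import numpy as np
import trax
from trax import shapes

model = trax.models.LSTMSeq2SeqAttn(input_vocab_size=256,
                                    target_vocab_size=256,
                                    d_model=64,
                                    mode='eval')

sig = shapes.ShapeDtype((2, 8), dtype=np.int32)  # (batch, length)
model.init((sig, sig))

source = np.random.randint(1, 256, size=(2, 8), dtype=np.int32)
target = np.random.randint(1, 256, size=(2, 8), dtype=np.int32)
log_probs, out_target = model((source, target))  # log_probs: (2, 8, 256)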
Code example #12
File: rnn.py Project: yangliuy/trax
def LSTMSeq2SeqAttn(input_vocab_size=256,
                    target_vocab_size=256,
                    d_model=512,
                    n_encoder_layers=2,
                    n_decoder_layers=2,
                    n_attention_heads=1,
                    attention_dropout=0.0,
                    mode='train'):
    """Returns an LSTM sequence-to-sequence model with attention.

  The input to the model is a pair (input tokens, target tokens), e.g.,
  an English sentence (tokenized) and its translation into German (tokenized).

  The model works as follows:

  * Input encoder runs on the input tokens and creates activations that
    are used as both keys and values in attention.
  * Pre-attention decoder runs on the targets and creates
    activations that are used as queries in attention.
  * Attention runs on the queries, keys and values masking out input padding.
  * Decoder runs on the result, followed by a cross-entropy loss.

  Args:
    input_vocab_size: int: vocab size of the input
    target_vocab_size: int: vocab size of the target
    d_model: int:  depth of embedding (n_units in the LSTM cell)
    n_encoder_layers: int: number of LSTM layers in the encoder
    n_decoder_layers: int: number of LSTM layers in the decoder after attention
    n_attention_heads: int: number of attention heads
    attention_dropout: float, dropout for the attention layer
    mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference

  Returns:
    An LSTM sequence-to-sequence model with attention.
  """
    input_encoder = tl.Serial(
        tl.Embedding(input_vocab_size, d_model),
        [tl.LSTM(d_model) for _ in range(n_encoder_layers)],
    )

    pre_attention_decoder = tl.Serial(
        tl.ShiftRight(mode=mode),
        tl.Embedding(target_vocab_size, d_model),
        tl.LSTM(d_model),
    )

    def PrepareAttentionInputs():
        """Layer that prepares queries, keys, values and mask for attention."""
        def F(encoder_activations, decoder_activations, input_tokens):
            keys = values = encoder_activations
            queries = decoder_activations
            # Mask is 1 where inputs are not padding (0) and 0 where they are padding.
            mask = (input_tokens != 0)
            # We need to add axes to the mask for attention heads and decoder length.
            mask = jnp.reshape(mask, (mask.shape[0], 1, 1, mask.shape[1]))
            # Broadcast so mask is [batch, 1 for heads, decoder-len, encoder-len].
            mask = mask + jnp.zeros((1, 1, decoder_activations.shape[1], 1))
            return queries, keys, values, mask

        return tl.Fn('PrepareAttentionInputs', F, n_out=4)

    return tl.Serial(  # in-toks, target-toks
        tl.Select([0, 1, 0, 1]),  # in-toks, target-toks, in-toks, target-toks
        tl.Parallel(input_encoder, pre_attention_decoder),
        PrepareAttentionInputs(),  # q, k, v, mask, target-toks
        tl.Residual(
            tl.AttentionQKV(d_model,
                            n_heads=n_attention_heads,
                            dropout=attention_dropout,
                            mode=mode)),  # decoder-vecs, mask, target-toks
        tl.Select([0, 2]),  # decoder-vecs, target-toks
        [tl.LSTM(d_model) for _ in range(n_decoder_layers)],
        tl.Dense(target_vocab_size),
        tl.LogSoftmax())
Code example #13
def input_encoder_fn(input_vocab_size, d_model, n_encoder_layers):
    input_encoder = tl.Serial(
        tl.Embedding(input_vocab_size, d_model),
        [tl.LSTM(d_model) for _ in range(n_encoder_layers)])
    return input_encoder
Code example #14
def pre_attention_decoder_fn(mode, target_vocab_size, d_model):
    pre_attention_decoder = tl.Serial(tl.ShiftRight(mode=mode),
                                      tl.Embedding(target_vocab_size, d_model),
                                      tl.LSTM(d_model))
    return pre_attention_decoder