Exemple #1
0
def DecoderBlock(d_model, d_ff, n_heads, dropout, mode):
  """Returns a layer sequence that implements a Transformer decoder block.

  The input to the layer sequence is an activation tensor.

  Args:
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    A sequence of layers that maps an activation tensor to an activation tensor.
  """
  self_attention = [
      tl.LayerNorm(),  # vec
      tl.BasicCausalAttention(
          d_model, n_heads=n_heads, dropout=dropout, mode=mode),
      tl.Dropout(rate=dropout, mode=mode),  # vec
  ]
  feed_forward = [
      FeedForward(d_model, d_ff, dropout, mode=mode),
  ]
  return [
      tl.Residual(self_attention),
      tl.Residual(feed_forward),
  ]
def EncoderDecoder(d_model, d_ff, n_heads, dropout, layer_idx, mode):
    """Transformer encoder-decoder layer.

  The input is a triple (decoder_input, mask, encoder) where the mask is
  created from the original source to prevent attending to the padding part
  of the encoder.

  Args:
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    layer_idx: which layer are we at (for bookkeeping)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (decoder_activations, mask, encoder).
  """
    decoder_self_attention = [  #        vecs_d   pmask vecs_e
        tl.LayerNorm(),  #        vecs_d   ..... ......
        tl.BasicCausalAttention(d_model,
                                n_heads=n_heads,
                                dropout=dropout,
                                mode=mode),
        tl.Dropout(rate=dropout, mode=mode),  # vecs_d          ..... ......
    ]
    decoder_to_encoder_attention = [  # vecs_d        masks         vecs_e
        tl.LayerNorm(),  # vecs_d        masks         vecs_e
        tl.Parallel([], [], tl.Dup()),  # ______        _____  vecs_e vecs_e
        tl.Parallel([], tl.Swap()),  # ______        vecs_e masks  ......
        tl.Parallel([], tl.Dup()),  # ______ vecs_e vecs_e .....  ......
        tl.AttentionQKV(  # (q k v masks ... --> vecs_d masks ...)
            d_model,
            n_heads=n_heads,
            dropout=dropout,
            mode=mode),
        tl.Dropout(rate=dropout, mode=mode),  # vecs_d mask vecs_e
    ]
    feed_forward = [
        FeedForward(d_model, d_ff, dropout, layer_idx=layer_idx, mode=mode),
    ]
    return [  # vecs_d masks vecs_e
        tl.Residual(decoder_self_attention),  # vecs_d masks vecs_e
        tl.Residual(decoder_to_encoder_attention),  # vecs_d masks vecs_e
        tl.Residual(feed_forward),  # vecs_d masks vecs_e
    ]