Example #1
def DecoderBlock(d_feature, d_feedforward, n_heads, dropout, mode):
    """Returns a layer sequence that implements a Transformer decoder block.

  The input to the layer sequence is an activation tensor.

  Args:
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    A sequence of layers that maps an activation tensor to an activation tensor.
  """
    self_attention = [
        tl.LayerNorm(),  # vec
        tl.Dup(),  # vec vec
        tl.Parallel([], tl.CausalMask(axis=-2)),  # vec mask
        tl.MultiHeadedAttention(d_feature,
                                n_heads=n_heads,
                                dropout=dropout,
                                mode=mode),
        tl.Parallel([], tl.Drop()),  # vec
        tl.Dropout(rate=dropout, mode=mode),  # vec
    ]
    feed_forward = [
        FeedForward(d_feature, d_feedforward, dropout, mode=mode),
    ]
    return [
        tl.Residual(self_attention),
        tl.Residual(feed_forward),
    ]
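The tl.CausalMask(axis=-2) branch above builds the usual lower-triangular mask that stops each position from attending to later positions. A minimal NumPy sketch of what such a mask looks like (illustrative only, not the trax implementation):

import numpy as np

def causal_mask(seq_len):
    # Position i may attend to positions 0..i, never to the future.
    return np.tril(np.ones((seq_len, seq_len), dtype=bool))

print(causal_mask(4).astype(int))
# [[1 0 0 0]
#  [1 1 0 0]
#  [1 1 1 0]
#  [1 1 1 1]]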
def DecoderLayer(feature_depth, feedforward_depth, num_heads, dropout, mode):
    """Transformer decoder layer.

  Args:
    feature_depth: int:  depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
    return layers.Serial(
        layers.Residual(  # Self-attention block.
            layers.LayerNorm(),
            layers.Branch(),
            layers.Parallel(
                layers.Identity(),  # activation for (q, k, v)
                layers.CausalMask(axis=-2)),  # attention mask
            layers.MultiHeadedAttention(feature_depth,
                                        num_heads=num_heads,
                                        dropout=dropout,
                                        mode=mode),
            layers.Dropout(rate=dropout, mode=mode)),
        ResidualFeedForward(feature_depth,
                            feedforward_depth,
                            dropout,
                            mode=mode))
Example #3
def DecoderLayer(positions, d_feature, d_feedforward, n_heads, dropout, mode):
    """Transformer decoder layer.

  Args:
    positions: random vectors for positions
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
    return [
        tl.Residual(  # Self-attention block.
            PreservePosition(tl.LayerNorm()),
            tl.Dup(),
            tl.Parallel(
                [],  # activation for (q, k, v)
                tl.CausalMask(axis=-2)),  # attention mask
            MultiHeadedAttentionPosition(positions,
                                         d_feature,
                                         n_heads=n_heads,
                                         dropout=dropout,
                                         mode=mode),
            PreservePosition(tl.Dropout(rate=dropout, mode=mode))),
        ResidualFeedForward(d_feature, d_feedforward, dropout, mode=mode)
    ]
Example #4
def DecoderLayer(feature_depth, feedforward_depth, num_heads, dropout, mode):
    """Transformer decoder layer.

  Args:
    feature_depth: int:  depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
    return tl.Serial(
        tl.Residual(  # Self-attention block.
            tl.LayerNorm(),
            tl.Branch(tl.Copy(), tl.CausalMask(axis=-2)),  # Create mask.
            tl.MultiHeadedAttention(feature_depth,
                                    num_heads=num_heads,
                                    dropout=dropout,
                                    mode=mode),
            tl.Select(0),  # Drop the mask.
            tl.Dropout(rate=dropout, mode=mode)),
        ResidualFeedForward(feature_depth,
                            feedforward_depth,
                            dropout,
                            mode=mode))
Example #5
def DecoderBlock(d_feature, d_feedforward, n_heads, dropout, mode):
    """Transformer decoder layer.

  Args:
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
    self_attention = [
        tl.LayerNorm(),
        tl.Branch([], tl.CausalMask(axis=-2)),  # Create mask.
        tl.MultiHeadedAttention(d_feature,
                                n_heads=n_heads,
                                dropout=dropout,
                                mode=mode),
        tl.Select(0),  # Drop mask.
        tl.Dropout(rate=dropout, mode=mode),
    ]
    feed_forward = [
        FeedForward(d_feature, d_feedforward, dropout, mode=mode),
    ]
    return [
        tl.Residual(self_attention),
        tl.Residual(feed_forward),
    ]
def EncoderDecoderLayer(feature_depth, feedforward_depth, num_heads, dropout,
                        mode):
    """Transformer encoder-decoder layer.

  The input is a triple (encoder, mask, decoder_input) where
  the mask is created from the original source to prevent attending
  to the padding part of the encoder.

  Args:
    feature_depth: int:  depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (encoder, mask, decoder_activations).
  """
    # Decoder self-attending to decoder.
    self_attention = layers.Residual(
        layers.LayerNorm(),
        layers.Branch(),
        layers.Parallel(
            layers.Identity(),  # activation for (q, k, v)
            layers.CausalMask(axis=-2)),  # attention mask
        layers.MultiHeadedAttention(feature_depth,
                                    num_heads=num_heads,
                                    dropout=dropout,
                                    mode=mode),
        layers.Dropout(rate=dropout, mode=mode))
    # Decoder attending to encoder.
    encoder_decoder_attention = layers.Serial(
        layers.Reorder(output=((2, 0, 0), 1)),  # ((dec, enc, enc), mask)
        layers.MultiHeadedAttentionQKV(  # ((q, k, v), mask) --> new v
            feature_depth,
            num_heads=num_heads,
            dropout=dropout,
            mode=mode),
        layers.Dropout(rate=dropout, mode=mode),
    )
    return layers.Serial(
        layers.Parallel(layers.Identity(), layers.Identity(), self_attention),
        layers.Branch(),
        layers.Parallel(layers.Identity(), encoder_decoder_attention),
        layers.UnnestBranches(),  # (encoder, mask, old_act, new_act)
        layers.Reorder(output=(0, 1, (2, 3))),
        layers.Parallel(  # Residual after encoder-decoder attention.
            layers.Identity(), layers.Identity(), layers.SumBranches()),
        layers.Parallel(  # Feed-forward on the third component (decoder).
            layers.Identity(), layers.Identity(),
            ResidualFeedForward(feature_depth,
                                feedforward_depth,
                                dropout,
                                mode=mode)))
Example #7
def EncoderDecoderLayer(feature_depth, feedforward_depth, num_heads, dropout,
                        mode):
    """Transformer encoder-decoder layer.

  The input is a triple (encoder, mask, decoder_input) where
  the mask is created from the original source to prevent attending
  to the padding part of the encoder.

  Args:
    feature_depth: int:  depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (encoder, mask, decoder_activations).
  """
    # Decoder self-attending to decoder.
    self_attention = tl.Residual(
        tl.LayerNorm(),
        tl.Branch(tl.NoOp(), tl.CausalMask(axis=-2)),  # create mask
        tl.MultiHeadedAttention(feature_depth,
                                num_heads=num_heads,
                                dropout=dropout,
                                mode=mode),
        tl.Select(0),  # drop mask
        tl.Dropout(rate=dropout, mode=mode))
    # Decoder attending to encoder.
    encoder_decoder_attention = tl.Serial(
        tl.Select((2, 0, 0, 1)),  # (dec, enc, enc, mask)
        tl.MultiHeadedAttentionQKV(  # (q, k, v, mask) --> new, mask
            feature_depth,
            num_heads=num_heads,
            dropout=dropout,
            mode=mode),
        tl.Select(0),  # drop the mask
        tl.Dropout(rate=dropout, mode=mode),
    )
    return tl.Serial(
        tl.Parallel(tl.NoOp(), tl.NoOp(), self_attention),
        tl.Branch(tl.NoOp(), encoder_decoder_attention),
        tl.Select(inputs=(('encoder', 'mask', 'old_act'), 'new_act'),
                  output=('encoder', 'mask', ('old_act', 'new_act'))),
        tl.Parallel(  # Residual after encoder-decoder attention.
            tl.NoOp(), tl.NoOp(), tl.Add()),
        tl.Parallel(  # Feed-forward on the third component (decoder).
            tl.NoOp(), tl.NoOp(),
            ResidualFeedForward(feature_depth,
                                feedforward_depth,
                                dropout,
                                mode=mode)))
Example #8
def EncoderDecoder(d_feature, d_feedforward, n_heads, dropout, mode):
    """Transformer encoder-decoder layer.

  The input is a triple (decoder_input, mask, encoder) where
  the mask is created from the original source to prevent attending
  to the padding part of the encoder.

  Args:
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (decoder_activations, mask, encoder).
  """
    decoder_self_attention = [
        # TODO(jonni): Work on combinators so that this flow is cleaner/clearer.
        tl.LayerNorm(),
        tl.Dup(),
        tl.CausalMask(axis=-2),  # Create the self-attention mask.
        tl.Swap(),  # Put mask behind the activations.
        tl.MultiHeadedAttention(d_feature,
                                n_heads=n_heads,
                                dropout=dropout,
                                mode=mode),
        tl.Swap(),  # Put self-attention mask on top.
        tl.Drop(),  # Drop self-attention mask.
        tl.Dropout(rate=dropout, mode=mode),
    ]
    decoder_to_encoder_attention = [
        tl.Select((0, 2, 2, 1, 2)),  # (dec, enc, enc, mask, enc-copy)
        tl.MultiHeadedAttentionQKV(  # (q, k, v, mask, ...) --> (new, mask, ...)
            d_feature,
            n_heads=n_heads,
            dropout=dropout,
            mode=mode),
        tl.Dropout(rate=dropout, mode=mode),
    ]
    feed_forward = [
        FeedForward(d_feature, d_feedforward, dropout, mode=mode),
    ]
    return [
        tl.Residual(decoder_self_attention),
        tl.Residual(decoder_to_encoder_attention),
        tl.Residual(feed_forward),
    ]
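The tl.Select((0, 2, 2, 1, 2)) step above is just a reindexing of the data stack. A toy pure-Python model of that reordering, under the stack reading used in the comments (a sketch of the idea, not the trax combinator):

def select(stack, indices):
    # Build a new stack top by picking (and possibly repeating) entries.
    return tuple(stack[i] for i in indices)

stack = ('dec', 'mask', 'enc')          # decoder activations, padding mask, encoder activations
print(select(stack, (0, 2, 2, 1, 2)))   # ('dec', 'enc', 'enc', 'mask', 'enc')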
Example #9
def EncoderDecoder(d_feature, d_feedforward, n_heads, dropout, mode):
    """Transformer encoder-decoder layer.

  The input is a triple (decoder_input, mask, encoder) where the mask is
  created from the original source to prevent attending to the padding part
  of the encoder.

  Args:
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (decoder_activations, mask, encoder).
  """
    decoder_self_attention = [  #        vecs_d   pmask vecs_e
        tl.LayerNorm(),  #        vecs_d   ..... ......
        tl.Dup(),  # vecs_d vecs_d   ..... ......
        tl.Parallel([],
                    tl.CausalMask(axis=-2)),  # ______ masks    ..... ......
        tl.MultiHeadedAttention(d_feature,
                                n_heads=n_heads,
                                dropout=dropout,
                                mode=mode),
        tl.Parallel([], tl.Drop()),  # ______   0      ..... ......
        tl.Dropout(rate=dropout, mode=mode),  # vecs_d          ..... ......
    ]
    decoder_to_encoder_attention = [  # vecs_d        masks         vecs_e
        tl.Parallel([], [], tl.Dup()),  # ______        _____  vecs_e vecs_e
        tl.Parallel([], tl.Swap()),  # ______        vecs_e masks  ......
        tl.Parallel([], tl.Dup()),  # ______ vecs_e vecs_e .....  ......
        tl.MultiHeadedAttentionQKV(  # (q k v masks ... --> vecs_d masks ...)
            d_feature,
            n_heads=n_heads,
            dropout=dropout,
            mode=mode),
        tl.Dropout(rate=dropout, mode=mode),  # vecs_d mask vecs_e
    ]
    feed_forward = [
        FeedForward(d_feature, d_feedforward, dropout, mode=mode),
    ]
    return [  # vecs_d masks vecs_e
        tl.Residual(decoder_self_attention),  # vecs_d masks vecs_e
        tl.Residual(decoder_to_encoder_attention),  # vecs_d masks vecs_e
        tl.Residual(feed_forward),  # vecs_d masks vecs_e
    ]
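As the stack-trace comments show, each tl.Residual(...) above changes only the decoder activations and passes the mask and encoder activations through untouched. A toy sketch of that behavior under the same stack reading (an assumption-laden simplification, not the actual combinator):

def residual(sublayer, stack):
    # The sublayer reads the whole stack but emits new activations only for
    # the first entry; the residual adds them to the old first entry and
    # leaves the rest of the stack (mask, encoder) unchanged.
    new_first = sublayer(stack)
    return (stack[0] + new_first,) + stack[1:]

stack = (1.0, 'mask', 'enc')              # scalar stand-in for the decoder activations
print(residual(lambda s: 0.5, stack))     # (1.5, 'mask', 'enc')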
Example #10
def EncoderDecoderLayer(feature_depth, feedforward_depth, num_heads, dropout,
                        mode):
    """Transformer encoder-decoder layer.

  The input is a triple (decoder_input, mask, encoder) where
  the mask is created from the original source to prevent attending
  to the padding part of the encoder.

  Args:
    feature_depth: int:  depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (decoder_activations, mask, encoder).
  """
    # Decoder self-attending to decoder.
    self_attention = tl.Residual(
        tl.LayerNorm(),
        tl.Dup(),
        tl.CausalMask(axis=-2),  # Create the self-attention mask.
        tl.Swap(),  # Put mask behind the activations.
        tl.MultiHeadedAttention(feature_depth,
                                num_heads=num_heads,
                                dropout=dropout,
                                mode=mode),
        tl.Swap(),  # Put self-attention mask on top.
        tl.Drop(),  # Drop self-attention mask.
        tl.Dropout(rate=dropout, mode=mode))
    # Decoder attending to encoder.
    encoder_decoder_attention = tl.Serial(
        tl.Select((0, 2, 2, 1, 2)),  # (dec, enc, enc, mask, enc-copy)
        tl.MultiHeadedAttentionQKV(  # (q, k, v, mask, ...) --> (new, mask, ...)
            feature_depth,
            num_heads=num_heads,
            dropout=dropout,
            mode=mode),
        tl.Dropout(rate=dropout, mode=mode),
    )
    return tl.Serial(
        self_attention, tl.Residual(encoder_decoder_attention),
        ResidualFeedForward(feature_depth,
                            feedforward_depth,
                            dropout,
                            mode=mode))
def ChunkedCausalMultiHeadedAttention(d_feature,
                                      n_heads=8,
                                      dropout=0.0,
                                      chunk_selector=None,
                                      mode='train'):
    """Transformer-style causal multi-headed attention operating on chunks.

  Accepts inputs that are a list of chunks and applies causal attention.

  Args:
    d_feature: int:  depth of embedding
    n_heads: int: number of attention heads
    dropout: float: dropout rate
    chunk_selector: a function from chunk number to list of chunks to attend.
    mode: str: 'train' or 'eval'

  Returns:
    Multi-headed self-attention layer.
  """
    prepare_attention_input = tl.Serial(
        tl.Branch(
            tl.Branch(  # q = k = v = first input
                tl.NoOp(), tl.NoOp(), tl.NoOp()),
            tl.CausalMask(axis=-2),
        ),
        tl.Parallel(
            tl.Parallel(
                tl.Dense(d_feature),
                tl.Dense(d_feature),
                tl.Dense(d_feature),
            ), tl.NoOp()))
    return tl.Serial(
        tl.Map(prepare_attention_input),
        ChunkedAttentionSelector(selector=chunk_selector),  # pylint: disable=no-value-for-parameter
        tl.Map(tl.PureMultiHeadedAttention(d_feature=d_feature,
                                           n_heads=n_heads,
                                           dropout=dropout,
                                           mode=mode),
               check_shapes=False),
        tl.Map(tl.Select(0), check_shapes=False),  # drop masks
        tl.Map(tl.Dense(d_feature)))
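The chunk_selector argument is a plain function from a chunk index to the list of chunk indices that chunk may attend to. A hypothetical selector (not part of the library) that keeps attention causal across chunk boundaries by looking only at the current chunk and its immediate predecessor:

def local_causal_selector(chunk_index):
    # Attend to the previous chunk (if any) and to the current chunk itself.
    if chunk_index == 0:
        return [0]
    return [chunk_index - 1, chunk_index]

print([local_causal_selector(i) for i in range(4)])
# [[0], [0, 1], [1, 2], [2, 3]]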
def DecoderBlock(d_feature, d_feedforward, n_heads, n_attention_chunks,
                 dropout, mode):
    """Reversible transformer decoder layer.

  Args:
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    n_attention_chunks: int: number of chunks for memory-efficient attention
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
    self_attention = [
        tl.LayerNorm(),
        tl.Branch([], tl.CausalMask(axis=-2)),  # Create mask.
        tl.MultiHeadedAttention(d_feature,
                                n_heads=n_heads,
                                dropout=dropout,
                                mode=mode),
        tl.Select(0),  # Drop mask.
        tl.Dropout(rate=dropout, mode=mode),
    ]

    # TODO(kitaev): Memory-efficient attention. This chunking is temporary.
    self_attention = [
        Split(sections=n_attention_chunks, axis=-2),  # pylint: disable=no-value-for-parameter
        Map(self_attention),
        tl.Concatenate(axis=-2),
    ]

    feed_forward = [
        FeedForward(d_feature, d_feedforward, dropout, mode=mode),
    ]
    return [
        ReversibleResidual([self_attention], [feed_forward]),
    ]
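ReversibleResidual couples the attention and feed-forward blocks RevNet-style, so intermediate activations can be recomputed during the backward pass instead of being stored. A minimal NumPy sketch of the underlying reversible-residual idea (not the trax layer itself):

import numpy as np

def reversible_forward(x1, x2, f, g):
    # y1 = x1 + f(x2); y2 = x2 + g(y1)
    y1 = x1 + f(x2)
    y2 = x2 + g(y1)
    return y1, y2

def reversible_inverse(y1, y2, f, g):
    # The inputs are reconstructed exactly, so they need not be stored.
    x2 = y2 - g(y1)
    x1 = y1 - f(x2)
    return x1, x2

f = lambda x: np.tanh(x)          # stands in for the attention block
g = lambda x: np.maximum(x, 0.0)  # stands in for the feed-forward block
x1, x2 = np.ones(3), np.arange(3.0)
y1, y2 = reversible_forward(x1, x2, f, g)
r1, r2 = reversible_inverse(y1, y2, f, g)
assert np.allclose(r1, x1) and np.allclose(r2, x2)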
Example #13
def DecoderBlock(d_feature, d_feedforward, n_heads, n_attention_chunks,
                 dropout, mode):
    """Reversible transformer decoder layer.

  Args:
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    n_attention_chunks: int: number of chunks for memory-efficient attention
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
    self_attention = [
        tl.LayerNorm(),
        tl.Dup(),
        tl.Parallel([], tl.CausalMask(axis=-2)),  # Create mask.
        # TODO(kitaev): add dropout
        tl.Attention(d_feature, n_heads=n_heads, dropout=None, mode=mode),
        tl.Parallel([], tl.Drop()),  # Drop mask.
    ]

    # TODO(kitaev): Memory-efficient attention. This chunking is temporary.
    self_attention = [
        Chunk(sections=n_attention_chunks),  # pylint: disable=no-value-for-parameter
        self_attention,
        Unchunk(sections=n_attention_chunks),  # pylint: disable=no-value-for-parameter
    ]

    feed_forward = [
        FeedForward(d_feature, d_feedforward, dropout, mode=mode),
    ]
    return [
        ReversibleResidual([self_attention], [feed_forward]),
    ]
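The Chunk / Unchunk pair here (like Split / Concatenate in the earlier block) only cuts the length axis into n_attention_chunks pieces, runs the attention sublayers per piece, and stitches the results back together. A NumPy sketch of that splitting round-trip, assuming the length axis is -2 as in the code above:

import numpy as np

x = np.arange(2 * 8 * 4).reshape(2, 8, 4)     # (batch, length, d_feature)
chunks = np.split(x, 2, axis=-2)              # two (2, 4, 4) pieces along the length axis
restored = np.concatenate(chunks, axis=-2)    # stitch them back together
assert restored.shape == x.shape and np.array_equal(restored, x)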