Example #1
def _FunnelBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode,
                 ff_activation, pool_layer, pool_size, strides, separate_cls):
    """Internal funnel block. Returns a list of layers implementing it.

  The input to the block is a pair, (activations, mask).

  Args:
    d_model: Final dimension of tensors at most points in the model, including
        the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    n_heads: Number of attention heads.
    dropout: Stochastic rate (probability) for dropping an activation value
        when applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask.
        Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
        a useful way to save memory and apply consistent masks to activation
        vectors at different sequence positions.
    mode: If `'train'`, each block will include dropout; else, it will
        pass all values through unaltered.
    ff_activation: Type of activation function at the end of each block; must
        be an activation-type subclass of `Layer`.
    pool_layer: Type of pooling layer used for downsampling;
        should be `tl.AvgPool` or `tl.MaxPool`.
    pool_size: Shape of window that gets reduced to a single vector value.
        If the layer inputs are :math:`n`-dimensional arrays, then `pool_size`
        must be a tuple of length :math:`n-2`.
    strides: Offsets from the location of one window to the locations of
        neighboring windows along each axis. If specified, must be a tuple of
        the same length as `pool_size`. If None, then offsets of 1 along each
        window axis, :math:`(1, ..., 1)`, will be used.
    separate_cls: If `True`, pooling in funnel blocks is not applied to
        embeddings of the first token (`cls` from BERT paper).

  Returns:
    A list of layers that maps (activations, mask) to (activations', mask').
  """
  pooling = PoolLayer(pool_layer, pool_size, strides, separate_cls)
  mask_pooling = MaskPool(pool_size, strides, separate_cls)

  attention = tl.AttentionQKV(d_model,
                              n_heads=n_heads,
                              dropout=dropout,
                              mode=mode)
  hidden_dropout = tl.Dropout(rate=dropout,
                              shared_axes=dropout_shared_axes,
                              mode=mode)

  feed_forward = _FeedForwardBlock(d_model, d_ff, dropout,
                                   dropout_shared_axes, mode, ff_activation)

  return [  # h, mask
      tl.LayerNorm(),  # h, mask
      tl.Branch(pooling, None),  # h', h, mask
      tl.Residual(
          tl.Select([0, 1, 1, 2]),  # h', h, h, mask
          attention,  # attn, mask
          tl.Parallel(None, mask_pooling),  # attn, mask'
          hidden_dropout  # attn, mask'
      ),  # funnel_activations, mask'
      tl.Residual(feed_forward)
  ]
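
A rough usage sketch (the hyperparameters and surrounding wiring are illustrative assumptions, not taken from the source): the block expects an (activations, mask) pair on the stack, for example as produced earlier in a model by an embedding branch paired with `tl.PaddingMask()`. With `pool_size=(2,)` and `strides=(2,)` it roughly halves the sequence length and pools the mask to match.

import trax.layers as tl

# Hypothetical hyperparameters; PoolLayer, MaskPool and _FeedForwardBlock are
# the module-level helpers referenced above.
funnel_block = tl.Serial(_FunnelBlock(
    d_model=512, d_ff=2048, n_heads=8,
    dropout=0.1, dropout_shared_axes=None, mode='train',
    ff_activation=tl.Relu,
    pool_layer=tl.AvgPool, pool_size=(2,), strides=(2,),
    separate_cls=True))
# Maps (activations [batch, length, 512], mask)
#   -> (activations [batch, ~length // 2, 512], pooled mask)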
Example #2
def _FunnelRelativeDecoderBlock(d_model, d_ff, n_heads, dropout,
                                dropout_shared_axes, mode, ff_activation,
                                total_pooling, shorten_factor, resampler_fn):
  """Returns a list of layers that implements a Transformer decoder block.

  The input is an activation tensor.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    n_heads: Number of attention heads.
    dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask.
      Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
      a useful way to save memory and apply consistent masks to activation
      vectors at different sequence positions.
    mode: If `'train'`, each block will include dropout; else, it will
      pass all values through unaltered.
    ff_activation: Type of activation function at the end of each block; must
      be an activation-type subclass of `Layer`.
    total_pooling: The combined pool size of previously used funnel blocks.
    shorten_factor: By how much to shorten (downsample) or upsample the
      sequence at this funnel block.
    resampler_fn: Callable that builds the funnel upsampling/downsampling
      layer; called as `resampler_fn(shorten_factor, d_model)`, it must return
      a `Layer` that performs the resampling.

  Returns:
    A list of layers that maps an activation tensor to an activation tensor.
  """
  resampler = resampler_fn(shorten_factor, d_model)

  attention = RelativeAttentionLMLayer(
      d_model, total_pooling, n_heads=n_heads, dropout=dropout, mode=mode)

  feed_forward = _FeedForwardBlock(
      d_model, d_ff, dropout, dropout_shared_axes, mode, ff_activation)

  dropout_ = tl.Dropout(
      rate=dropout, shared_axes=dropout_shared_axes, mode=mode)

  return [
      tl.LayerNorm(),            # h
      tl.Branch(tl.Serial(
          resampler,
          tl.LayerNorm(),
      ), None),                  # h', h
      tl.Residual(
          tl.Select([0, 1, 1]),  # h', h, h
          attention,
          dropout_,
      ),
      tl.Residual(
          feed_forward
      ),
  ]
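
A minimal sketch of a resampler_fn, assuming a plain reshape-and-project downsampler; the name `NaiveDownsampler` and its internals are illustrative assumptions, not the resamplers used in the source. It merges every `shorten_factor` consecutive positions and projects the result back to `d_model`.

import trax.layers as tl

def NaiveDownsampler(shorten_factor, d_model):
  """Hypothetical resampler: shortens the length axis by `shorten_factor`."""
  def _merge(x):  # x: [batch, length, d_model]; length must divide evenly
    batch, length, d = x.shape
    return x.reshape(batch, length // shorten_factor, d * shorten_factor)
  return tl.Serial(
      tl.Fn('Merge', _merge),
      tl.Dense(d_model),  # project the merged features back to d_model
  )

decoder_block = tl.Serial(_FunnelRelativeDecoderBlock(
    d_model=512, d_ff=2048, n_heads=8, dropout=0.1,
    dropout_shared_axes=None, mode='train', ff_activation=tl.Relu,
    total_pooling=2, shorten_factor=2, resampler_fn=NaiveDownsampler))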
Example #3
def _RelativeDecoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                          mode, ff_activation, context_bias_layer,
                          location_bias_layer, total_pooling):
  """Returns a list of layers that implements a Transformer encoder block.

  The input to the block is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
        the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    n_heads: Number of attention heads.
    dropout: Stochastic rate (probability) for dropping an activation value
        when applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask.
        Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
        a useful way to save memory and apply consistent masks to activation
        vectors at different sequence positions.
    mode: If `'train'`, each block will include dropout; else, it will
        pass all values through unaltered.
    ff_activation: Type of activation function at the end of each block; must
        be an activation-type subclass of `Layer`.
    context_bias_layer: Global context bias from Transformer XL's attention.
    location_bias_layer: Global location bias from Transformer XL's attention.
    total_pooling: The combined pool size of previously used funnel blocks.

  Returns:
    A list of layers that maps an activation tensor to an activation tensor.
  """
  attention = RelativeAttentionLMLayer(
      d_model, context_bias_layer, location_bias_layer,
      total_pooling,
      n_heads=n_heads, dropout=dropout, mode=mode)

  feed_forward = _FeedForwardBlock(
      d_model, d_ff, dropout, dropout_shared_axes, mode, ff_activation)

  dropout_ = tl.Dropout(
      rate=dropout, shared_axes=dropout_shared_axes, mode=mode)

  return [
      tl.Residual(               # vecs
          tl.LayerNorm(),
          tl.Select([0, 0, 0]),
          attention,
          dropout_,
      ),                         # vecs
      tl.Residual(
          feed_forward
      ),                         # vecs
  ]
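
A hedged sketch of how the shared bias layers might be built (the shapes, initializer scale, and surrounding names are assumptions, not taken from the source): the Transformer-XL context and location biases are small learned tensors constructed once and passed to every decoder block, so their weights are shared across blocks.

import trax.layers as tl
from trax.layers import initializers as init

d_model, n_heads = 512, 8
d_head = d_model // n_heads
bias_init = init.RandomNormalInitializer(1e-6)
# Weight-only layers (they take no inputs) holding the per-head bias tensors.
context_bias_layer = tl.Weights(bias_init, shape=(1, n_heads, 1, d_head))
location_bias_layer = tl.Weights(bias_init, shape=(1, n_heads, 1, d_head))

block = tl.Serial(_RelativeDecoderBlock(
    d_model, d_ff=2048, n_heads=n_heads, dropout=0.1,
    dropout_shared_axes=None, mode='train', ff_activation=tl.Relu,
    context_bias_layer=context_bias_layer,
    location_bias_layer=location_bias_layer,
    total_pooling=1))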
Example #4
def EncoderBlock(d_model, d_ff, n_heads, dropout, ff_activation, mode):
  """Returns a list of layers that implements a Reformer encoder block.

  The input to the layer is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    ff_activation: the non-linearity in feed-forward layer
    mode: str: 'train' or 'eval'

  Returns:
    A list of layers that maps (activations, mask) to (activations, mask).
  """
  attention = tl.SelfAttention(
      n_heads=n_heads, d_qk=d_model//n_heads, d_v=d_model//n_heads,
      masked=True,
      attention_dropout=0.0,  # TODO(kitaev): attention dropout
      mode=mode)
  attention_half_residual = ReversibleHalfResidualV2(
      tl.LayerNorm(),
      attention_layer=attention,
      # TODO(kitaev): add output dropout to attention layer. rate=dropout
  )

  # TODO(kitaev): Switch to FeedForward with BroadcastedDropout?
  feed_forward = transformer._FeedForwardBlock(  # pylint: disable=protected-access
      d_model, d_ff, dropout, -1, mode, ff_activation)
  # feed_forward = FeedForward(d_model, d_ff, dropout, ff_activation, mode)

  return [
      attention_half_residual,
      tl.ReversibleSwap(),
      ReversibleHalfResidualV2(feed_forward),
      tl.ReversibleSwap(),
  ]
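
A hedged wiring sketch (the hyperparameters and the merge step are assumptions, not taken from the source): reversible blocks operate on two copies of the activations, so a model duplicates the activation stream before `tl.ReversibleSerial` and merges the two streams afterwards, while the mask rides along underneath.

import trax.layers as tl

blocks = []
for _ in range(2):  # two encoder blocks, purely for illustration
  blocks.extend(EncoderBlock(
      d_model=512, d_ff=2048, n_heads=8,
      dropout=0.1, ff_activation=tl.Relu, mode='train'))

encoder_body = tl.Serial(
    tl.Dup(),                     # (x, mask) -> (x, x, mask)
    tl.ReversibleSerial(blocks),  # run the reversible block stack
    tl.Fn('XYAvg',
          lambda x1, x2: (x1 + x2) / 2.0),  # merge the two streams
    tl.LayerNorm(),
)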
Example #5
def EncoderBlock(d_model, d_ff, n_heads, dropout, ff_activation, mode):
    """Returns a list of layers that implements a Reformer encoder block.

  The input to the layer is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    ff_activation: the non-linearity in feed-forward layer
    mode: str: 'train' or 'eval'

  Returns:
    A list of layers that maps (activations, mask) to (activations, mask).
  """
  pre_attention = tl.LayerNorm()
  attention = tl.Attention(d_model,
                           n_heads=n_heads,
                           dropout=dropout,
                           mode=mode)
  post_attention = tl.Dropout(rate=dropout,
                              name='dropout_enc_attn',
                              mode=mode)

  # TODO(kitaev): Switch to FeedForward with BroadcastedDropout?
  feed_forward = transformer._FeedForwardBlock(  # pylint: disable=protected-access
      d_model, d_ff, dropout, -1, mode, ff_activation)
  # feed_forward = FeedForward(d_model, d_ff, dropout, ff_activation, mode)

  return [
      # TODO(kitaev): consider ReversibleAttentionHalfResidual for efficiency
      ReversibleHalfResidual([pre_attention, attention, post_attention]),
      tl.ReversibleSwap(),
      ReversibleHalfResidual(feed_forward),
      tl.ReversibleSwap(),
  ]
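
For comparison, a hedged sketch of the same computation written with ordinary (non-reversible) residual connections, mirroring the standard Trax Transformer encoder block; `PlainEncoderBlock` is an illustrative name, not part of the source, and it gives up the memory savings that the reversible version provides.

import trax.layers as tl
from trax.models import transformer

def PlainEncoderBlock(d_model, d_ff, n_heads, dropout, ff_activation, mode):
  """Hypothetical non-reversible counterpart of EncoderBlock above."""
  attention = tl.Attention(d_model, n_heads=n_heads, dropout=dropout, mode=mode)
  feed_forward = transformer._FeedForwardBlock(  # pylint: disable=protected-access
      d_model, d_ff, dropout, -1, mode, ff_activation)
  return [
      tl.Residual(               # (vecs, mask) -> (vecs, mask)
          tl.LayerNorm(),
          attention,
          tl.Dropout(rate=dropout, mode=mode),
      ),
      tl.Residual(feed_forward),
  ]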