Example 1
 def PositionalEncoder(vocab_size):  # tokens --> vectors
   return [
       tl.Embedding(vocab_size, d_model),
       tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
       tl.PositionalEncoding(max_len=max_len),
   ]
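
This helper is a closure: d_model, dropout, dropout_shared_axes, mode and max_len come from the enclosing model constructor. A minimal standalone sketch of the same token-embedding pipeline, with assumed values for those hyperparameters:

import numpy as np
from trax import layers as tl
from trax import shapes

embed = tl.Serial(
    tl.Embedding(100, 512),               # vocab_size=100, d_model=512 (assumed)
    tl.Dropout(rate=0.1, mode='train'),
    tl.PositionalEncoding(max_len=2048),
)
toks = np.arange(16, dtype=np.int32).reshape(1, 16)
embed.init(shapes.signature(toks))
vecs = embed(toks)                        # shape: (1, 16, 512)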
Example 2
def FeedForwardWithOptions(d_model,
                           d_ff,
                           dropout,
                           dropout_shared_axes,
                           ff_activation,
                           ff_dropout,
                           ff_chunk_size,
                           ff_use_sru,
                           ff_sparsity,
                           mode,
                           use_bfloat16=False,
                           ff_sparsity_type='1inN'):
    """Feed-Forward block with all the options.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    dropout: Stochastic rate (probability) for dropping an activation value when
      applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    ff_activation: Type of activation function at the end of each block; must be
      an activation-type subclass of `Layer`.
    ff_dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout after the FF dense layer.
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_use_sru: int or pair of ints; if > 0, we use this many SRU layers
      in addition to the feed-forward block (second int specifies sru size)
    ff_sparsity: int, tuple or string; if not 0, use sparse feed-forward block
      with this sparsity
    mode: If `'train'`, each block will include dropout; else, it will pass all
      values through unaltered.
    use_bfloat16: whether to use bfloat16 for weights (default: False).
    ff_sparsity_type: string, if ff_sparsity >0,
      use SparseFF if ff_sparsity_type=`'1inN'` and
      use BlockSparseFF if ff_sparsity_type=`'Block'`

  Returns:
    A list of layers which maps vectors to vectors.
  """
    if ff_sparsity and ff_sparsity_type == '1inN':
        temperature, quant_prob = 0.1, 0.3
        if isinstance(ff_sparsity, str):
            # This is hacky but used to pass ff_sparsity in yaml sweep files.
            ff_sparsity = [(float(x) if '.' in x else int(x))
                           for x in ff_sparsity.split()]
        if isinstance(ff_sparsity, (list, tuple)):
            if len(ff_sparsity) == 2:
                n_elements_in_block, d_lowrank = ff_sparsity
            else:
                n_elements_in_block, d_lowrank, temperature, quant_prob = ff_sparsity
        else:
            assert isinstance(ff_sparsity, int)
            n_elements_in_block, d_lowrank = ff_sparsity, d_ff // ff_sparsity
        ff = tl.SparseFF(d_ff,
                         n_elements_in_block=n_elements_in_block,
                         d_lowrank=d_lowrank,
                         temperature=temperature,
                         quant_prob=quant_prob,
                         use_bfloat16=use_bfloat16,
                         mode=mode,
                         dropout_rate=dropout,
                         dropout_shared_axes=dropout_shared_axes,
                         ff_chunk_size=ff_chunk_size)
    elif ff_sparsity and ff_sparsity_type == 'Block':
        ff = tl.BlockSparseFF(d_ff, num_experts=ff_sparsity, mode=mode)
    else:
        ff = _FeedForward(d_model, d_ff, dropout, ff_activation, ff_dropout,
                          use_bfloat16, mode)
    res = [tl.LayerNorm(), ff]
    if ff_sparsity_type != '1inN' or ff_sparsity == 0:
        # SparseFF has Dropout and BatchLeadingAxes built-in.
        res.append(
            tl.Dropout(rate=dropout,
                       shared_axes=dropout_shared_axes,
                       mode=mode))
        if ff_chunk_size > 0:
            res = tl.BatchLeadingAxes(tl.Chunk(tl.Serial(res), ff_chunk_size))
    if ff_use_sru:
        if isinstance(ff_use_sru, (list, tuple)):
            sru_n_layers, sru_n_units = ff_use_sru
        else:
            sru_n_layers, sru_n_units = ff_use_sru, 32
        sru = [tl.SRU(sru_n_units, mode=mode) for _ in range(sru_n_layers)]
        block = [tl.LayerNorm(), tl.Dense(sru_n_units)
                 ] + sru + [tl.Dense(d_model)]
        res = tl.Residual(block, shortcut=res)
    return [res]
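
A hedged usage sketch, assuming the definition above and its private helpers (such as _FeedForward) are in scope; with ff_sparsity=0 and ff_use_sru=0 it reduces to the plain dense feed-forward block:

import numpy as np
from trax import layers as tl
from trax import shapes

ff = tl.Serial(FeedForwardWithOptions(
    d_model=512, d_ff=2048, dropout=0.1, dropout_shared_axes=None,
    ff_activation=tl.Relu, ff_dropout=0.1, ff_chunk_size=0,
    ff_use_sru=0, ff_sparsity=0, mode='train'))
x = np.zeros((2, 16, 512), dtype=np.float32)
ff.init(shapes.signature(x))
y = ff(x)   # output keeps the input shape: (2, 16, 512)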
Example 3
def DecoderBlock(d_model,
                 d_ff,
                 n_heads,
                 dropout,
                 dropout_shared_axes,
                 mode,
                 ff_activation,
                 ff_dropout,
                 ff_chunk_size,
                 ff_use_sru,
                 ff_sparsity,
                 ff_sparsity_type,
                 attention_chunk_size,
                 attention_type,
                 n_attention_layers=1,
                 n_feedforward_layers=1):
    """Returns a list of layers that implements a Transformer decoder block.

  The input is an activation tensor.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    n_heads: Number of attention heads.
    dropout: Stochastic rate (probability) for dropping an activation value when
      applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    mode: If `'train'`, each block will include dropout; else, it will pass all
      values through unaltered.
    ff_activation: Type of activation function at the end of each block; must be
      an activation-type subclass of `Layer`.
    ff_dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout after the FF dense layer.
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_use_sru: int or pair of ints; if > 0, we use this many SRU layers
      in addition to the feed-forward block (second int specifies sru size)
    ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity
    ff_sparsity_type: string, if ff_sparsity >0,
      use SparseFF if ff_sparsity_type=`'1inN'` and
      use BlockSparseFF if ff_sparsity_type=`'Block'`
    attention_chunk_size: int, if > 0 run attention chunked at this size
    attention_type: The attention layer to use.
    n_attention_layers: how many residual causal attention layers should we
      have before the feed-forward block (default: 1, the standard block)
    n_feedforward_layers: how many FFNN layers should we have (default 1).

  Returns:
    A list of layers that maps an activation tensor to an activation tensor.
  """
    # pylint: disable=g-complex-comprehension
    causal_attentions = [
        ApplyAttentionLayer(attention_type,
                            d_model,
                            n_heads,
                            d_model // n_heads,
                            d_model // n_heads,
                            causal=True,
                            masked=False,
                            attention_dropout=dropout,
                            output_dropout=dropout,
                            attention_chunk_size=attention_chunk_size,
                            mode=mode) for _ in range(n_attention_layers)
    ]

    residual_attentions = [
        tl.Residual(
            tl.LayerNorm(), causal_attentions[i],
            tl.Dropout(rate=dropout,
                       shared_axes=dropout_shared_axes,
                       mode=mode)) for i in range(n_attention_layers)
    ]

    feed_forwards = [
        tl.Residual(
            FeedForwardWithOptions(d_model, d_ff, dropout, dropout_shared_axes,
                                   ff_activation, ff_dropout, ff_chunk_size,
                                   ff_use_sru, ff_sparsity, mode, False,
                                   ff_sparsity_type))
        for _ in range(n_feedforward_layers)
    ]
    # pylint: enable=g-complex-comprehension

    return residual_attentions + feed_forwards
Example 4
 def test_call_in_eval_mode_does_no_dropout(self):
     layer = tl.Dropout(rate=0.1, mode='eval')
     x = np.ones((2, 5, 1000))
     y = layer(x)
     self.assertEqual(np.count_nonzero(y), 10_000)
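
For contrast, a rough train-mode companion sketch (not a strict test: the exact number of dropped values is stochastic):

import numpy as np
from trax import layers as tl

layer = tl.Dropout(rate=0.1, mode='train')
x = np.ones((2, 5, 1000))        # 10,000 values
y = layer(x)
print(np.count_nonzero(y))       # roughly 9,000 values remain nonzero; survivors are scaled by 1/(1 - rate)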
Example 5
def createPosEncoder(vocabSize, embeddingDepth, dropout, maxLength, mode):
    return [
        tl.Embedding(vocabSize, embeddingDepth),
        tl.Dropout(rate=dropout, mode=mode),
        tl.PositionalEncoding(max_len=maxLength, mode=mode)
    ]
Example 6
def ReformerLM(vocab_size,
               d_model=512,
               d_ff=2048,
               d_attention_key=64,
               d_attention_value=64,
               n_layers=6,
               n_heads=8,
               dropout=0.1,
               max_len=2048,
               attention_type=tl.SelfAttention,
               axial_pos_shape=(),
               d_axial_pos_embs=None,
               ff_activation=tl.FastGelu,
               ff_use_sru=0,
               ff_chunk_size=0,
               mode='train'):
    """Reversible transformer language model (only uses a decoder, no encoder).

  Args:
    vocab_size: int: vocab size
    d_model: int:  depth of *each half* of the two-part features
    d_ff: int: depth of feed-forward layer
    d_attention_key: int: depth of key vector for each attention head
    d_attention_value: int: depth of value vector for each attention head
    n_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    attention_type: class: attention class to use, such as SelfAttention.
    axial_pos_shape: tuple of ints: input shape to use for the axial position
      encoding. If unset, axial position encoding is disabled.
    d_axial_pos_embs: tuple of ints: depth of position embedding for each axis.
      Tuple length must match axial_pos_shape, and values must sum to d_model.
    ff_activation: the non-linearity in feed-forward layer
    ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    mode: str: 'train', 'eval', or 'predict'

  Returns:
    the layer.
  """
    positional_encoding = PositionalEncoding(mode, dropout, max_len,
                                             axial_pos_shape, d_axial_pos_embs)

    positional_embedder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode),  # pylint: disable=no-value-for-parameter
        positional_encoding,
    ]

    decoder_blocks = []

    if isinstance(attention_type, (tuple, list)):
        assert n_layers % len(attention_type) == 0
    else:
        attention_type = [attention_type]
    for layer_idx in range(n_layers):
        layer_attention_type = attention_type[layer_idx % len(attention_type)]
        decoder_block = DecoderBlock(d_model,
                                     d_ff,
                                     d_attention_key,
                                     d_attention_value,
                                     n_heads,
                                     attention_type=layer_attention_type,
                                     dropout=dropout,
                                     ff_activation=ff_activation,
                                     ff_use_sru=ff_use_sru,
                                     ff_chunk_size=ff_chunk_size,
                                     mode=mode)
        decoder_blocks.append(decoder_block)

    return tl.Serial(
        tl.ShiftRight(mode=mode),
        positional_embedder,
        tl.Dup(),
        tl.ReversibleSerial(decoder_blocks),
        tl.Concatenate(),
        # TODO(kitaev): Test whether dropout should go before or after the
        # LayerNorm, and whether dropout broadcasting is needed here.
        tl.LayerNorm(),
        tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode),  # pylint: disable=no-value-for-parameter
        tl.Dense(vocab_size),
        tl.LogSoftmax(),
    )
Example 7
def LayerDropTransformerLM(vocab_size,
                           d_model=512,
                           d_ff=2048,
                           n_layers=6,
                           n_heads=8,
                           dropout=0.1,
                           max_len=2048,
                           mode='train',
                           ff_activation=tl.Relu,
                           skip_fraction=0.4):
  """Returns a LayerDrop Transformer language model.

  The input to the model is a tensor of tokens. (This model uses only the
  decoder part of the overall Transformer.)

  Args:
    vocab_size: int: vocab size
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_layers: int: number of encoder/decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference
    ff_activation: the non-linearity in feed-forward layer
    skip_fraction: probability of skipping a layer; it can be a single
        probability or a list of probabilities different for each layer

  Returns:
    A Transformer language model as a layer that maps from a tensor of tokens
    to activations over a vocab set.
  """
  embedder = [
      tl.Embedding(vocab_size, d_model),
      tl.Dropout(rate=dropout, mode=mode),
      tl.PositionalEncoding(max_len=max_len, mode=mode),
  ]

  if not isinstance(skip_fraction, (list, tuple)):
    # If we don't get a list of skip_fractions we use the same skip_fraction
    # for each layer.
    skip_fraction = [skip_fraction for i in range(n_layers)]
  if len(skip_fraction) != n_layers:
    raise ValueError('n_layers ({}) must be equal to len(skip_fraction) ({})'
                     .format(n_layers, len(skip_fraction)))

  def ConditionedBlock(current_layer_num):
    return tl.Serial(
        # stack: embedding
        tl.RandomUniform(0., 1, sync=True),
        # stack: random_uniform, embedding
        tl.Cond(
            # if random_uniform > skip_fraction
            LargerThan(skip_fraction[current_layer_num] if mode == 'train'
                       else 0.0),
            # then: run block
            tl.Serial(transformer._DecoderBlock(  # pylint: disable=g-complex-comprehension,protected-access
                d_model, d_ff, n_heads, dropout, [], mode, ff_activation)),
            # else: run noop
            tl.Serial()
            )
        # stack: embedding
        )

  return tl.Serial(
      tl.ShiftRight(mode=mode),
      embedder,
      [ConditionedBlock(i) for i in range(n_layers)],
      tl.LayerNorm(),
      tl.Dense(vocab_size),
      tl.LogSoftmax(),
  )
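
A rough construction-and-forward sketch, assuming the helpers referenced above (LargerThan, transformer._DecoderBlock) are importable alongside this definition:

import numpy as np
from trax import shapes

model = LayerDropTransformerLM(vocab_size=128, d_model=32, d_ff=64,
                               n_layers=2, n_heads=2, max_len=64,
                               skip_fraction=[0.2, 0.5])
tokens = np.zeros((1, 64), dtype=np.int32)
model.init(shapes.signature(tokens))
log_probs = model(tokens)   # (1, 64, 128) log-probabilities over the vocab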
Example 8
def ReformerLM(vocab_size,
               d_model=512,
               d_ff=2048,
               d_attention_key=64,
               d_attention_value=64,
               n_layers=6,
               n_heads=8,
               dropout=0.1,
               max_len=2048,
               attention_type=tl.SelfAttention,
               pos_type=None,
               pos_axial_shape=(),
               pos_d_axial_embs=None,
               ff_activation=tl.FastGelu,
               ff_use_sru=0,
               ff_chunk_size=0,
               ff_sparsity=0,
               loss_sparsity_type='mult',
               loss_sparsity=0,
               loss_d_lowrank=0,
               loss_sparsity_prob=None,
               attention_chunk_size=0,
               mode='train'):
    """Reversible transformer language model (only uses a decoder, no encoder).

  Args:
    vocab_size: int: vocab size
    d_model: int:  depth of *each half* of the two-part features
    d_ff: int: depth of feed-forward layer
    d_attention_key: int: depth of key vector for each attention head
    d_attention_value: int: depth of value vector for each attention head
    n_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    attention_type: class: attention class to use, such as SelfAttention.
    pos_type: string, the type of positional embeddings to use.
    pos_axial_shape: tuple of ints: input shape to use for the axial position
      encoding. If unset, axial position encoding is disabled.
    pos_d_axial_embs: tuple of ints: depth of position embedding for each axis.
      Tuple length must match pos_axial_shape, and values must sum to d_model.
    ff_activation: the non-linearity in feed-forward layer
    ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity
    loss_sparsity_type: str, type of sparsity to use in loss layer. See
      SparseDenseWithOptions for options. None if no sparsity should be used.
    loss_sparsity: int, the sparsity for loss layer (if used)
    loss_d_lowrank: int, the dimensions for intermediate layer (if used)
    loss_sparsity_prob: float, the probability for sparse version of loss to be
      used. If None, only sparse version is used.
    attention_chunk_size: int, if > 0 run attention chunked at this size
    mode: str: 'train', 'eval', or 'predict'

  Returns:
    the layer.
  """
    positional_encoding = ct.PositionalEncoder(mode, dropout, max_len,
                                               pos_type, pos_axial_shape,
                                               pos_d_axial_embs)

    positional_embedder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode),  # pylint: disable=no-value-for-parameter
        positional_encoding,
    ]

    decoder_blocks = []

    if isinstance(attention_type, (tuple, list)):
        assert n_layers % len(attention_type) == 0
    else:
        attention_type = [attention_type]
    for layer_idx in range(n_layers):
        layer_attention_type = attention_type[layer_idx % len(attention_type)]
        decoder_block = DecoderBlock(d_model,
                                     d_ff,
                                     d_attention_key,
                                     d_attention_value,
                                     n_heads,
                                     attention_type=layer_attention_type,
                                     dropout=dropout,
                                     ff_activation=ff_activation,
                                     ff_dropout=dropout,
                                     ff_use_sru=ff_use_sru,
                                     ff_chunk_size=ff_chunk_size,
                                     ff_sparsity=ff_sparsity,
                                     attention_chunk_size=attention_chunk_size,
                                     mode=mode)
        decoder_blocks.append(decoder_block)

    dense_loss_layer = tl.SparseDenseWithOptions(
        vocab_size,
        d_input=d_model,
        sparsity_type=loss_sparsity_type,
        sparsity=loss_sparsity,
        d_lowrank=loss_d_lowrank,
        prob_sparse=loss_sparsity_prob,
        mode=mode)

    return tl.Serial(
        tl.ShiftRight(mode=mode),
        positional_embedder,
        tl.Dup(),
        tl.ReversibleSerial(decoder_blocks),
        tl.Concatenate(),
        # TODO(kitaev): Test whether dropout should go before or after the
        # LayerNorm, and whether dropout broadcasting is needed here.
        tl.LayerNorm(),
        tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode),  # pylint: disable=no-value-for-parameter
        dense_loss_layer,
    )
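
A small end-to-end sketch; keyword availability differs a bit between trax releases, so treat the exact arguments as assumptions:

import numpy as np
import trax
from trax import shapes

model = trax.models.ReformerLM(vocab_size=256, d_model=64, d_ff=128,
                               n_layers=2, n_heads=2, max_len=128,
                               mode='train')
tokens = np.zeros((1, 128), dtype=np.int32)
model.init(shapes.signature(tokens))
activations = model(tokens)   # (1, 128, 256) activations over the vocab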
Example 9
def EveryOtherLayerDropTransformerLM(vocab_size,
                                     d_model=512,
                                     d_ff=2048,
                                     n_layers=6,
                                     n_heads=8,
                                     dropout=0.1,
                                     max_len=2048,
                                     mode='train',
                                     ff_activation=tl.Relu,
                                     skip_mode='even',
                                     skip_fraction=0.5,
                                     eval_skip_fraction=0.0):
    """Returns an "EveryOther" LayerDrop Transformer language model.

  During each training step it either runs all layers, or skips a subset of
  layers. This subset is the same every time, and it is specified by
  "skip_mode".
  The input to the model is a tensor of tokens. (This model uses only the
  decoder part of the overall Transformer.)

  Args:
    vocab_size: int: vocab size
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_layers: int: number of encoder/decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference
    ff_activation: the non-linearity in feed-forward layer
    skip_mode: which layers to skip when skipping: even/odd/1half/2half.
    skip_fraction: fraction of times to skip layers
    eval_skip_fraction: fraction of times to skip layers during eval

  Returns:
    A Transformer language model as a layer that maps from a tensor of tokens
    to activations over a vocab set.
  """
    embedder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, mode=mode),
        tl.PositionalEncoding(max_len=max_len, mode=mode),
    ]

    if mode != 'train':
        skip_fraction = eval_skip_fraction

    skip_mode_funs = {  # which layers should be skipped?
        'even': (lambda num: num%2 == 0),  # 0th layer is even
        'odd': (lambda num: num%2 == 1),
        '1half': (lambda num: num < (n_layers/2)),
        '2half': (lambda num: num >= (n_layers/2)),
    }

    skip_mode_fun = skip_mode_funs[skip_mode]

    @assert_shape('...sd,->...sd,')
    def ConditionedBlock(current_layer_num):
        return tl.Serial(
            # stack: embedding, n_layers_to_keep
            tl.Select([1, 0,
                       1]),  # n_layers_to_keep, embedding, n_layers_to_keep
            tl.Cond(
                # if random() > skip_fraction OR layer not in skip_mode ...
                LargerThan(skip_fraction if skip_mode_fun(current_layer_num
                                                          ) else 0.0),
                # then: run block
                tl.Serial(
                    transformer._DecoderBlock(  # pylint: disable=g-complex-comprehension,protected-access
                        d_model, d_ff, n_heads, dropout, [], mode,
                        ff_activation))
                # else: noop (implicit)
            )
            # stack: embedding, n_layers_to_keep
        )

    return tl.Serial(
        tl.ShiftRight(mode=mode),
        embedder,
        # stack: embedding
        tl.RandomUniform(0., 1., sync=True),
        # stack: n_layers_to_keep, embedding
        tl.Swap(),
        # stack: embedding, n_layers_to_keep
        [ConditionedBlock(i) for i in range(n_layers)],
        # stack: embedding, n_layers_to_keep
        tl.Select([0], n_in=2),  # stack: embedding
        tl.LayerNorm(),
        tl.Dense(vocab_size),
    )
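
Construction-only sketch (a forward pass works as in the earlier examples), assuming the surrounding module's helpers are in scope; here the first half of the layers is skipped half of the time during training:

model = EveryOtherLayerDropTransformerLM(
    vocab_size=128, d_model=32, d_ff=64, n_layers=4, n_heads=2,
    max_len=64, skip_mode='1half', skip_fraction=0.5)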
Example 10
def _DecoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode,
                  ff_activation, ff_dropout, ff_chunk_size, ff_use_sru,
                  ff_sparsity, ff_sparsity_type, attention_chunk_size,
                  attention_type):
    """Returns a list of layers that implements a Transformer decoder block.

  The input is an activation tensor.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    n_heads: Number of attention heads.
    dropout: Stochastic rate (probability) for dropping an activation value when
      applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    mode: If `'train'`, each block will include dropout; else, it will pass all
      values through unaltered.
    ff_activation: Type of activation function at the end of each block; must be
      an activation-type subclass of `Layer`.
    ff_dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout after the FF dense layer.
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward
    ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity
    ff_sparsity_type: string, if ff_sparsity >0,
      use SparseFF if ff_sparsity_type=`'1inN'` and
      use BlockSparseFF if ff_sparsity_type=`'Block'`
    attention_chunk_size: int, if > 0 run attention chunked at this size
    attention_type: The attention layer to use.

  Returns:
    A list of layers that maps an activation tensor to an activation tensor.
  """
    causal_attention = ApplyAttentionLayer(
        attention_type,
        d_model,
        n_heads,
        d_model // n_heads,
        d_model // n_heads,
        causal=True,
        masked=False,
        attention_dropout=dropout,
        output_dropout=dropout,
        attention_chunk_size=attention_chunk_size,
        mode=mode)

    feed_forward = FeedForwardWithOptions(d_model, d_ff, dropout,
                                          dropout_shared_axes, ff_activation,
                                          ff_dropout, ff_chunk_size,
                                          ff_use_sru, ff_sparsity, mode,
                                          ff_sparsity_type)

    dropout_ = tl.Dropout(rate=dropout,
                          shared_axes=dropout_shared_axes,
                          mode=mode)

    return [
        tl.Residual(
            tl.LayerNorm(),
            causal_attention,
            dropout_,
        ),
        tl.Residual(feed_forward),
    ]
Example 11
    def test_run_reversible_same_as_default_extended(self):
        """Runs the reversible trainer, check results are the same as default."""
        inputs_batch = np.arange(8).reshape((2, 4))
        targets_batch = 2 * inputs_batch
        labeled_batch = (inputs_batch, targets_batch,
                         np.ones_like(targets_batch))
        # We want to test rng propagation too, so adding some dropout layers.
        first_layer = tl.Serial(tl.Embedding(9, 4), tl.Dropout(0.5), tl.Dup())
        rev_layers1 = [
            tl.ReversibleHalfResidual(tl.Dense(4), tl.Dropout(0.2)),
            tl.ReversibleSwap(),
            tl.ReversibleHalfResidual(tl.Dropout(0.5), tl.Dense(4)),
            tl.ReversibleSwap()
        ]
        mid_layer = tl.Serial(tl.Add(), tl.Dense(4), tl.Dup())
        rev_layers2 = [
            tl.ReversibleHalfResidual(tl.Dense(4), tl.Dropout(0.3)),
            tl.ReversibleSwap()
        ]
        loss_layer = tl.Serial(tl.Concatenate(), tl.Dense(19), tl.Dropout(0.3),
                               tl.LogSoftmax(), tl.CrossEntropyLoss())
        model = tl.Serial([first_layer] + rev_layers1 + [mid_layer] +
                          rev_layers2 + [loss_layer])
        rng_init = fastmath.random.get_prng(12)
        model.init(labeled_batch, rng=rng_init)
        optimizer_fn = optimizers.Adam  # to test slots

        # Make 3 steps with the original trainer.
        optimizer = optimizer_fn()
        optimizer.tree_init(model.weights)
        trainer = optimizers.Trainer(model, optimizer)
        rng_step1 = fastmath.random.get_prng(7)
        rng_step2 = fastmath.random.get_prng(8)
        rng_step3 = fastmath.random.get_prng(9)
        trainer.one_step(labeled_batch, rng_step1)
        trainer.one_step(labeled_batch, rng_step2, learning_rate=0.02)
        trainer.one_step(labeled_batch, rng_step3, learning_rate=0.03)
        first_layer_weights1 = first_layer.weights
        rev_layer12_weights1 = rev_layers1[2].weights
        mid_layer_weights1 = mid_layer.weights
        rev_layer20_weights1 = rev_layers2[0].weights
        loss_layer_weights1 = loss_layer.weights

        # Now make 3 steps with reversible trainer.
        model.init(labeled_batch, rng=rng_init)
        # TODO(lukaszkaiser): this test seems to fail with memoize_jit, why?
        trainer = optimizers.ReversibleSerialTrainer(
            [(first_layer.sublayers, rev_layers1),
             (mid_layer.sublayers, rev_layers2)],
            loss_layer,
            optimizer_fn,
            memoize_jit=False)
        trainer.one_step(labeled_batch, rng_step1)
        trainer.one_step(labeled_batch, rng_step2, learning_rate=0.02)
        trainer.one_step(labeled_batch, rng_step3, learning_rate=0.03)

        # Check that weights end up the same.
        self._assert_all_equal(loss_layer_weights1, loss_layer.weights)
        self._assert_all_equal(rev_layer20_weights1, rev_layers2[0].weights)
        self._assert_all_equal(mid_layer_weights1, mid_layer.weights)
        self._assert_all_equal(rev_layer12_weights1, rev_layers1[2].weights)
        self._assert_all_equal(first_layer_weights1, first_layer.weights)
Example 12
def FeedForwardWithOptions(d_model,
                           d_ff,
                           dropout,
                           dropout_shared_axes,
                           ff_activation,
                           ff_dropout,
                           ff_chunk_size,
                           ff_use_sru,
                           ff_sparsity,
                           mode,
                           ff_sparsity_type='1inN'):
    """Feed-Forward block with all the options.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    dropout: Stochastic rate (probability) for dropping an activation value when
      applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    ff_activation: Type of activation function at the end of each block; must be
      an activation-type subclass of `Layer`.
    ff_dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout after the FF dense layer.
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward
    ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity
    mode: If `'train'`, each block will include dropout; else, it will pass all
      values through unaltered.
    ff_sparsity_type: string, if ff_sparsity >0,
      use SparseFF if ff_sparsity_type=`'1inN'` and
      use BlockSparseFF if ff_sparsity_type=`'Block'`

  Returns:
    A list of layers which maps vectors to vectors.
  """
    if ff_use_sru:
        return [tl.SRU(d_model) for _ in range(ff_use_sru)]
    elif ff_sparsity and ff_sparsity_type == '1inN':
        ff = tl.SparseFF(d_ff,
                         n_elements_in_block=ff_sparsity,
                         d_lowrank=d_ff // ff_sparsity,
                         mode=mode)
        if ff_chunk_size < 1:
            chunked_ff = ff
        else:
            chunked_ff = tl.BatchLeadingAxes(
                tl.Chunk(tl.Serial(ff), ff_chunk_size))
        return [
            tl.LayerNorm(), chunked_ff,
            tl.Dropout(rate=dropout,
                       shared_axes=dropout_shared_axes,
                       mode=mode)
        ]
    elif ff_sparsity and ff_sparsity_type == 'Block':
        return [
            tl.LayerNorm(),
            tl.BlockSparseFF(d_ff, num_experts=ff_sparsity, mode=mode),
            tl.Dropout(rate=dropout,
                       shared_axes=dropout_shared_axes,
                       mode=mode)
        ]
    else:
        return [
            ChunkedFeedForward(d_model, d_ff, dropout, ff_activation,
                               ff_dropout, ff_chunk_size, mode)
        ]
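
A small sketch of the ff_use_sru branch of this variant: with ff_use_sru=2 the feed-forward block is replaced entirely by two SRU layers at width d_model (assumes the definition above is in scope):

from trax import layers as tl

ff = FeedForwardWithOptions(d_model=512, d_ff=2048, dropout=0.1,
                            dropout_shared_axes=None, ff_activation=tl.Relu,
                            ff_dropout=0.1, ff_chunk_size=0, ff_use_sru=2,
                            ff_sparsity=0, mode='train')
# ff is now [tl.SRU(512), tl.SRU(512)] -- no LayerNorm or Dropout wrapper.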
Example 13
 def Embedder(vocab_size):  # tokens --> vectors
     return [
         tl.Embedding(vocab_size, d_model),
         tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode),
     ]
Example 14
def HourglassLM(vocab_size,
                d_model=512,
                d_ff=2048,
                vanilla_layers=(1, 1),
                hierarchy='6@3',
                n_heads=8,
                dropout=0.1,
                dropout_shared_axes=None,
                mode='train',
                ff_activation=tl.FastGelu,
                vanilla_attn_type=RelativeAttentionWrapper,
                middle_attn_type=RelativeAttentionWrapper,
                downsampling_fn=AttentionResampling,
                upsampling_fn=AttentionResampling,
                attention_downsampling_fn=AveragePooling,
                attention_upsampling_fn=LinearUpsampling):
    """Returns a hierarchical Transformer language model.

  This model performs autoregressive language modeling:

    - input: rank 2 tensor representing a batch of text strings via token IDs
      plus padding markers; shape is (batch_size, sequence_length). The tensor
      elements are integers in `range(vocab_size)`, and `0` values mark padding
      positions.

    - output: rank 3 tensor representing a batch of log-probability
      distributions for each sequence position over possible token IDs;
      shape is (batch_size, sequence_length, `vocab_size`).

  This model uses only the decoder part of the overall Transformer.

  Args:
    vocab_size: Input vocabulary size -- each element of the input tensor should
      be an integer in `range(vocab_size)`. These integers typically represent
      token IDs from a vocabulary-based tokenizer.
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each encoder
      block.
    vanilla_layers: (pre_layers, post_layers) tuple - number of full token-level
      Transformer decoder layers before and after shortening.
    hierarchy: string - shortening hierarchy, as described in the paper.
      Hierarchy levels must form a palindrome, e.g. '1@2 2@6 1@2'.
    n_heads: Number of attention heads.
    dropout: Stochastic rate (probability) for dropping an activation value when
      applying dropout within an encoder block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    mode: str: 'train' or 'eval'.
    ff_activation: Type of activation function at the end of each encoder block;
      must be an activation-type subclass of `Layer`.
    vanilla_attn_type: class: attention class such as SelfAttention to use in
      the layers before and after shortening (vanilla layers).
    middle_attn_type: class: attention class to use in the middle layers (these
      operating on the shortened sequence).
    downsampling_fn: function that takes full token-level vectors of length `l`
      and transforms them into `l` / `k` vectors, where `k` denotes
      `shorten_factor` parameter.
    upsampling_fn: function that takes shortened representations of a sequence,
      consisting of `l` / `k` vectors and transforms them into full token-level
      representations of length `l`.
    attention_downsampling_fn: Downsampling function that transforms token-level
      vectors into query vectors with reduced length. Necessary only when
      AttentionResampling is used as `downsampling_fn`.
    attention_upsampling_fn: Upsampling function for AttentionResampling. Valid
      only when AttentionResampling is used as a `upsampling_fn`.

  Returns:
    A Transformer language model as a layer that maps from a tensor of tokens
    to activations over a vocab set.
  """
    assert mode != 'predict'  # For now, 'predict' mode is unsupported.
    hierarchy_n_layers, hierarchy_shorten_factors = _parse_hierarchy(hierarchy)

    token_encoder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode)
    ]

    context_bias_layer, location_bias_layer = get_rel_att_inputs(
        d_model, n_heads)

    n_pre_decoder_blocks, n_post_decoder_blocks = vanilla_layers

    def create_decoder_blocks(
            n_layers,
            total_pooling,  # pylint: disable = invalid-name
            attention_type):
        decoder_blocks = [
            # pylint: disable=g-complex-comprehension
            _RelativeDecoderBlock(attention_type, d_model, d_ff, n_heads,
                                  dropout, dropout_shared_axes, mode,
                                  ff_activation, context_bias_layer,
                                  location_bias_layer, total_pooling)
            for _ in range(n_layers)
        ]
        return decoder_blocks + [tl.LayerNorm()]

    def create_hourglass_valley(
            rest_shorten_factors,
            rest_n_funnel_blocks,  # pylint: disable = invalid-name
            current_total_pooling):
        assert rest_shorten_factors
        assert len(rest_shorten_factors) == len(rest_n_funnel_blocks)

        current_sf = rest_shorten_factors[0]
        current_n_layers = rest_n_funnel_blocks[0]

        shortening_layer = downsampling_fn(
            current_sf,
            d_model,
            is_upsampling=False,
            d_ff=d_ff,
            n_heads=n_heads,
            dropout=dropout,
            dropout_shared_axes=dropout_shared_axes,
            mode=mode,
            ff_activation=ff_activation,
            context_bias_layer=context_bias_layer,
            location_bias_layer=location_bias_layer,
            total_pooling=current_total_pooling,
            resampling_fn=attention_downsampling_fn)

        upsampling_layer = upsampling_fn(
            current_sf,
            d_model=d_model,
            is_upsampling=True,
            d_ff=d_ff,
            n_heads=n_heads,
            dropout=dropout,
            dropout_shared_axes=dropout_shared_axes,
            mode=mode,
            ff_activation=ff_activation,
            context_bias_layer=context_bias_layer,
            location_bias_layer=location_bias_layer,
            total_pooling=current_total_pooling,
            resampling_fn=attention_upsampling_fn)

        if len(rest_shorten_factors) > 1:  # we need to go deeper again
            pre_stage_blocks = create_decoder_blocks(
                current_n_layers, current_total_pooling * current_sf,
                middle_attn_type)

            post_stage_blocks = create_decoder_blocks(
                current_n_layers, current_total_pooling * current_sf,
                middle_attn_type)

            return [
                tl.Dup(),
                tl.ShiftRight(current_sf - 1, mode=mode), shortening_layer,
                pre_stage_blocks, *create_hourglass_valley(
                    rest_shorten_factors[1:], rest_n_funnel_blocks[1:],
                    current_total_pooling * current_sf), post_stage_blocks,
                upsampling_layer,
                tl.LayerNorm(),
                tl.Add()
            ]
        else:
            blocks = create_decoder_blocks(current_n_layers,
                                           current_total_pooling * current_sf,
                                           middle_attn_type)

            return [
                tl.Dup(),
                tl.ShiftRight(current_sf - 1), shortening_layer, blocks,
                upsampling_layer,
                tl.LayerNorm(),
                tl.Add()
            ]

    pre_decoder_blocks = create_decoder_blocks(n_pre_decoder_blocks, 1,
                                               vanilla_attn_type)

    post_decoder_blocks = create_decoder_blocks(n_post_decoder_blocks, 1,
                                                vanilla_attn_type)

    valley = create_hourglass_valley(hierarchy_shorten_factors,
                                     hierarchy_n_layers, 1)

    # Assemble and return the model.
    return tl.Serial(  # tokens (or chunked tuple of tokens)
        tl.ShiftRight(mode=mode),  # toks
        token_encoder,  # vecs
        pre_decoder_blocks,  # vecs
        valley,  # shortened vecs
        post_decoder_blocks,  # vecs
        tl.Dense(vocab_size),  # vecs
    )
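
A construction sketch for a tiny hourglass model (hypothetical hyperparameters; assumes the hourglass helpers such as AttentionResampling, get_rel_att_inputs and _parse_hierarchy are importable with this definition). The hierarchy string uses the same format as the default '6@3':

model = HourglassLM(vocab_size=128, d_model=64, d_ff=128,
                    vanilla_layers=(1, 1), hierarchy='2@3',
                    n_heads=2, dropout=0.1, mode='train')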
Example 15
def FunnelTransformerEncoder(vocab_size,
                             n_classes=10,
                             d_model=512,
                             d_ff=2048,
                             encoder_segment_lengths=(2, 2, 2),
                             n_heads=8,
                             max_len=2048,
                             dropout=0.1,
                             dropout_shared_axes=None,
                             mode='train',
                             ff_activation=tl.Relu,
                             pool_layer=tl.AvgPool,
                             pool_size=(2, ),
                             strides=(2, ),
                             separate_cls=True):
    """Returns a Funnel Encoder.

  This model performs text categorization:

    - input: rank 2 tensor representing a batch of text strings via token IDs
      plus padding markers; shape is (batch_size, sequence_length). The tensor
      elements are integers in `range(vocab_size)`, and `0` values mark padding
      positions.

    - output: rank 2 tensor representing a batch of log-probability
      distributions over N categories; shape is (batch_size, `n_classes`).

  Args:
    vocab_size: Input vocabulary size -- each element of the input tensor
        should be an integer in `range(vocab_size)`. These integers typically
        represent token IDs from a vocabulary-based tokenizer.
    n_classes: Final dimension of the output tensors, representing N-way
        classification.
    d_model: Final dimension of tensors at most points in the model, including
        the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each encoder
        block.
    encoder_segment_lengths: Tuple, where each element denotes the number of
        transformer encoder blocks preceding a funnel transformer block.
        There is no funnel block after the last sequence of encoder blocks,
        therefore the total number of blocks in the model is equal to
        `sum(encoder_segment_lengths) + len(encoder_segment_lengths) - 1`.
    n_heads: Number of attention heads.
    max_len: Maximum symbol length for positional encoding.
    dropout: Stochastic rate (probability) for dropping an activation value
        when applying dropout within an encoder block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask.
        Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
        a useful way to save memory and apply consistent masks to activation
        vectors at different sequence positions.
    mode: If `'train'`, each encoder block will include dropout; else, it will
        pass all values through unaltered.
    ff_activation: Type of activation function at the end of each encoder
        block; must be an activation-type subclass of `Layer`.
    pool_layer: Type of pooling layer used for downsampling in each of the
        funnel blocks; should be `tl.AvgPool` or `tl.MaxPool`.
    pool_size: Shape of window that gets reduced to a single vector value.
        If the layer inputs are :math:`n`-dimensional arrays, then `pool_size`
        must be a tuple of length :math:`n-2`.
    strides: Offsets from the location of one window to the locations of
        neighboring windows along each axis. If specified, must be a tuple of
        the same length as `pool_size`. If None, then offsets of 1 along each
        window axis, :math:`(1, ..., 1)`, will be used.
    separate_cls: If `True`, pooling in funnel blocks is not applied to
        embeddings of the first token (`cls` from BERT paper) and only final
        embedding of this token is used for categorization - the rest are
        discarded. If `False`, each token from the beginning is pooled and
        all embeddings are averaged and mapped to output categories like in
        original `TransformerEncoder` model.
  Returns:
    A Transformer model that maps strings (conveyed via token IDs) to
    probability-like activations over a range of output classes.
  """
    assert encoder_segment_lengths

    positional_encoder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        tl.PositionalEncoding(max_len=max_len)
    ]

    encoder_blocks = []
    n_encoder_segments = len(encoder_segment_lengths)

    for i in range(n_encoder_segments):
        # Building i'th segment
        for _ in range(encoder_segment_lengths[i]):
            # Create segment_size encoder blocks
            encoder_blocks.append(
                _EncoderBlock(d_model, d_ff, n_heads, dropout,
                              dropout_shared_axes, mode, ff_activation))

        # If not last segment, add funnel block
        if i != n_encoder_segments - 1:
            encoder_blocks.append(
                _FunnelBlock(d_model, d_ff, n_heads, dropout,
                             dropout_shared_axes, mode, ff_activation,
                             pool_layer, pool_size, strides, separate_cls))

    cls_pooling = SelectFirst() if separate_cls else tl.Mean(axis=1)

    # Assemble and return the model.
    return tl.Serial(  # toks
        # Encode.
        tl.Branch(positional_encoder, tl.PaddingMask()),  # vecs masks
        encoder_blocks,  # vecs masks
        tl.Select([0], n_in=2),  # vecs
        tl.LayerNorm(),  # vecs

        # Map to output categories.
        cls_pooling,  # cls
        tl.Dense(n_classes),  # cls
        tl.LogSoftmax(),  # cls
    )
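
A small classification sketch, assuming the definition above and its helpers (_EncoderBlock, _FunnelBlock, SelectFirst) are in scope:

import numpy as np
from trax import shapes

model = FunnelTransformerEncoder(vocab_size=128, n_classes=4, d_model=64,
                                 d_ff=128, encoder_segment_lengths=(1, 1),
                                 n_heads=2, max_len=64, mode='train')
tokens = np.ones((2, 64), dtype=np.int32)
model.init(shapes.signature(tokens))
log_probs = model(tokens)   # (2, 4) log-probabilities over the 4 classes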
Example 16
def LayerDropTransformerLM(vocab_size,
                           d_model=512,
                           d_ff=2048,
                           n_layers=6,
                           n_heads=8,
                           dropout=0.1,
                           max_len=2048,
                           mode='train',
                           ff_activation=tl.Relu,
                           skip_fraction=0.4,
                           eval_skip_fraction='every_other'):
    """Returns a LayerDrop Transformer language model.

  Based on Fan, Grave, Joulin 2019, https://arxiv.org/abs/1909.11556 .

  The input to the model is a tensor of tokens. (This model uses only the
  decoder part of the overall Transformer.)

  Args:
    vocab_size: int: vocab size
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_layers: int: number of encoder/decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference
    ff_activation: the non-linearity in feed-forward layer
    skip_fraction: probability of skipping a layer; it can be a single
        probability or a list of probabilities different for each layer
    eval_skip_fraction: probability of skipping a layer during eval; it can be a
        single probability, or a list of probabilities different for each layer,
        or a string "every other" implementing a strategy from original paper

  Returns:
    A Transformer language model as a layer that maps from a tensor of tokens
    to activations over a vocab set.
  """
    embedder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, mode=mode),
        tl.PositionalEncoding(max_len=max_len, mode=mode),
    ]

    if not isinstance(skip_fraction, (list, tuple)):
        # If we don't get a list of skip_fractions we use the same skip_fraction
        # for each layer.
        skip_fraction = [skip_fraction for i in range(n_layers)]
    if len(skip_fraction) != n_layers:
        raise ValueError(
            'n_layers ({}) must be equal to len(skip_fraction) ({})'.format(
                n_layers, len(skip_fraction)))

    if eval_skip_fraction == 'every_other':
        # Deterministically skip layers whose index is a multiple of
        # k = int(1 / skip_fraction); all other layers always run.
        eval_skip_fraction = [
            (1.0 if i % int(1. / skip_fraction[i]) == 0 else 0.0)
            if skip_fraction[i] != 0 else 0.0 for i in range(n_layers)
        ]
    if eval_skip_fraction == 'same':
        # Same skip_fraction as in training.
        eval_skip_fraction = skip_fraction
    if not isinstance(eval_skip_fraction, (list, tuple)):
        # If we don't get a list of eval_skip_fractions we use the same
        # eval_skip_fraction for each layer.
        eval_skip_fraction = [eval_skip_fraction for i in range(n_layers)]
    if len(eval_skip_fraction) != n_layers:
        raise ValueError(
            'n_layers ({}) must be equal to len(eval_skip_fraction) ({})'.
            format(n_layers, len(eval_skip_fraction)))

    @assert_shape('...sd->...sd')
    def ConditionedBlock(current_layer_num):
        return tl.Serial(
            # stack: embedding
            tl.RandomUniform(0., 1, sync=True),
            # stack: random_uniform, embedding
            tl.Cond(
                # if random_uniform > skip_fraction
                LargerThan(skip_fraction[current_layer_num] if mode ==
                           'train' else eval_skip_fraction[current_layer_num]),
                # then: run block
                tl.Serial(
                    transformer._DecoderBlock(  # pylint: disable=g-complex-comprehension,protected-access
                        d_model, d_ff, n_heads, dropout, [], mode,
                        ff_activation)),
                # else: run noop
                tl.Serial())
            # stack: embedding
        )

    return tl.Serial(
        tl.ShiftRight(mode=mode),
        embedder,
        [ConditionedBlock(i) for i in range(n_layers)],
        tl.LayerNorm(),
        tl.Dense(vocab_size),
    )
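
Sketch: the same model set up for eval with the paper's 'every_other' pruning strategy (construction only; assumes the module's helpers are importable):

eval_model = LayerDropTransformerLM(vocab_size=128, d_model=32, d_ff=64,
                                    n_layers=4, n_heads=2, max_len=64,
                                    mode='eval', skip_fraction=0.5,
                                    eval_skip_fraction='every_other')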
Example 17
def FunnelTransformer(vocab_size,
                      d_model=512,
                      d_ff=2048,
                      encoder_segment_lengths=(2, 2, 2),
                      n_decoder_blocks=2,
                      n_heads=8,
                      max_len=2048,
                      dropout=0.1,
                      dropout_shared_axes=None,
                      mode='train',
                      ff_activation=tl.Relu,
                      pool_layer=tl.AvgPool,
                      pool_size=(2, ),
                      separate_cls=True):
    """Returns a Full Funnel Transformer, that can be used for example for BERT.

  This model outputs token-level categorical distributions over all vocab:

    - input: rank 2 tensor representing a batch of text strings via token IDs
      plus padding markers; shape is (batch_size, sequence_length). The tensor
      elements are integers in `range(vocab_size)`, and `0` values mark padding
      positions.

    - output: rank 3 tensor representing a batch of log-probability
      distributions over `vocab_size` categories for each token; shape is
      (batch_size, sequence_length, vocab_size).


  Args:
    vocab_size: Input vocabulary size -- each element of the input tensor
        should be an integer in `range(vocab_size)`. These integers typically
        represent token IDs from a vocabulary-based tokenizer.
    d_model: Final dimension of tensors at most points in the model, including
        the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each encoder
        block.
    encoder_segment_lengths: Tuple, where each element denotes the number of
        transformer encoder blocks preceding a funnel transformer block.
        There is no funnel block after the last sequence of encoder blocks,
        therefore the total number of blocks in the model is equal to
        `sum(encoder_segment_lengths) + len(encoder_segment_lengths) - 1`.
    n_decoder_blocks: Number of transformer blocks in the upsampling decoder.
    n_heads: Number of attention heads.
    max_len: Maximum symbol length for positional encoding.
    dropout: Stochastic rate (probability) for dropping an activation value
        when applying dropout within an encoder block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask.
        Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
        a useful way to save memory and apply consistent masks to activation
        vectors at different sequence positions.
    mode: If `'train'`, each encoder block will include dropout; else, it will
        pass all values through unaltered.
    ff_activation: Type of activation function at the end of each encoder
        block; must be an activation-type subclass of `Layer`.
    pool_layer: Type of pooling layer used for downsampling in each of the
        funnel blocks; should be `tl.AvgPool` or `tl.MaxPool`.
    pool_size: Shape of window that gets reduced to a single vector value.
        If the layer inputs are :math:`n`-dimensional arrays, then `pool_size`
        must be a tuple of length :math:`n-2`.
    separate_cls: If `True`, pooling in funnel blocks is not applied to
        embeddings of the first token (`cls` from BERT paper) and only final
        embedding of this token is used for categorization - the rest are
        discarded. If `False`, each token from the beginning is pooled and
        all embeddings are averaged and mapped to output categories like in
        original `TransformerEncoder` model.
  """
    assert encoder_segment_lengths

    positional_encoder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        tl.PositionalEncoding(max_len=max_len)
    ]

    n_encoder_segments = len(encoder_segment_lengths)

    encoder_blocks_before_first_pooling = [
        _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                      mode, ff_activation)
        for _ in range(encoder_segment_lengths[0])
    ]
    encoder_blocks_from_first_pooling = []

    for i in range(1, n_encoder_segments):
        # Building i'th segment

        # Add funnel block between segments
        encoder_blocks_from_first_pooling.append(
            _FunnelBlock(d_model,
                         d_ff,
                         n_heads,
                         dropout,
                         dropout_shared_axes,
                         mode,
                         ff_activation,
                         pool_layer,
                         pool_size=pool_size,
                         strides=pool_size,
                         separate_cls=separate_cls))

        for _ in range(encoder_segment_lengths[i]):
            # Create segment_size encoder blocks
            encoder_blocks_from_first_pooling.append(
                _EncoderBlock(d_model, d_ff, n_heads, dropout,
                              dropout_shared_axes, mode, ff_activation))

    decoder_blocks = [
        _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                      mode, ff_activation) for _ in range(n_decoder_blocks)
    ]

    total_pool_size = pool_size[0]**(len(encoder_segment_lengths) - 1)

    # Assemble and return the model.
    return tl.Serial(  # toks
        tl.Branch(positional_encoder, tl.PaddingMask()),  # vecs masks
        encoder_blocks_before_first_pooling,  # vecs masks
        tl.Select([0, 1, 0, 1]),
        # vecs masks residual = vecs old_masks
        encoder_blocks_from_first_pooling,  # vecs masks residual masks
        tl.Select([0, 2, 3]),  # vecs residual masks
        tl.Parallel(
            # residual from first segment is taken before
            # normalization, so apply it now
            None,
            tl.LayerNorm(),
            None),  # vecs norm(residual) masks
        _Upsampler(total_pool_size, separate_cls),  # vecs masks
        decoder_blocks,
        tl.Select([0], n_in=2),  # vecs
        tl.LayerNorm(),
        tl.Dense(vocab_size),
        tl.LogSoftmax())
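
A quick arithmetic sketch (plain Python, assumed values) of how `encoder_segment_lengths` and `pool_size` determine the encoder-side block count and the shortened sequence length, matching the docstring formula and `total_pool_size` above:

# Plain arithmetic sketch with assumed values.
encoder_segment_lengths = (2, 2, 2)    # hypothetical segment sizes
pool_size = (2,)                       # each funnel block halves the sequence
seq_len = 128                          # hypothetical input length

n_regular_blocks = sum(encoder_segment_lengths)              # 6
n_funnel_blocks = len(encoder_segment_lengths) - 1           # 2, one between consecutive segments
print(n_regular_blocks + n_funnel_blocks)                    # 8 encoder-side blocks in total

total_pool_size = pool_size[0] ** (len(encoder_segment_lengths) - 1)
print(total_pool_size, seq_len // total_pool_size)           # 4, 32 positions before upsampling
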
Example No. 18
0
 def model_fn(mode='train'):
     return tl.Serial(
         tl.Dropout(mode=mode, rate=0.1), tl.BatchNorm(mode=mode),
         models.MLP(d_hidden=16,
                    n_output_classes=n_classes,
                    mode=mode))
Example No. 19
0
def ReformerShortenLM(vocab_size,
                      shorten_factor=1,
                      d_embedding=256,
                      d_model=512,
                      d_ff=2048,
                      d_attention_key=64,
                      d_attention_value=64,
                      n_layers=6,
                      n_heads=8,
                      dropout=0.1,
                      max_len=2048,
                      attention_type=tl.SelfAttention,
                      axial_pos_shape=(),
                      d_axial_pos_embs=None,
                      ff_activation=tl.FastGelu,
                      ff_use_sru=0,
                      ff_chunk_size=0,
                      mode='train'):
    """Reversible transformer language model with shortening.

  When shorten_factor is F and the input has shape [batch, length], we embed
  the (shifted-right) input and then group every F consecutive elements (along
  the length dimension) into a single vector -- so the main body of the model
  processes a tensor of shape ::

      [batch, length // F, d_model]

  almost until the end -- at the end it is un-shortened and an SRU is applied.
  This reduces the length processed inside the main model body, effectively
  making the model faster but possibly slightly less accurate.

  Args:
    vocab_size: int: vocab size
    shorten_factor: by how much to shorten, see above
    d_embedding: the depth of the embedding layer and final logits
    d_model: int:  depth of *each half* of the two-part features
    d_ff: int: depth of feed-forward layer
    d_attention_key: int: depth of key vector for each attention head
    d_attention_value: int: depth of value vector for each attention head
    n_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    attention_type: class: attention class to use, such as SelfAttention.
    axial_pos_shape: tuple of ints: input shape to use for the axial position
      encoding. If unset, axial position encoding is disabled.
    d_axial_pos_embs: tuple of ints: depth of position embedding for each axis.
      Tuple length must match axial_pos_shape, values must sum to d_embedding.
    ff_activation: the non-linearity in feed-forward layer
    ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
    assert mode != 'predict'  # TODO(lukaszkaiser,kitaev): fast inference

    if not axial_pos_shape:
        positional_encoding = tl.PositionalEncoding(max_len=max_len,
                                                    dropout=dropout,
                                                    mode=mode)
    else:
        assert d_axial_pos_embs is not None
        positional_encoding = tl.AxialPositionalEncoding(
            shape=axial_pos_shape,
            d_embs=d_axial_pos_embs,
            dropout_broadcast_dims=tuple(range(1,
                                               len(axial_pos_shape) + 1)),
            dropout=dropout,
            mode=mode)

    positional_embedder = [
        tl.Embedding(vocab_size, d_embedding),
        tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode),  # pylint: disable=no-value-for-parameter
        positional_encoding,
    ]

    decoder_blocks = []

    if isinstance(attention_type, (tuple, list)):
        assert n_layers % len(attention_type) == 0
    else:
        attention_type = [attention_type]
    for layer_idx in range(n_layers):
        layer_attention_type = attention_type[layer_idx % len(attention_type)]
        decoder_block = DecoderBlock(d_model,
                                     d_ff,
                                     d_attention_key,
                                     d_attention_value,
                                     n_heads,
                                     attention_type=layer_attention_type,
                                     dropout=dropout,
                                     ff_activation=ff_activation,
                                     ff_use_sru=ff_use_sru,
                                     ff_chunk_size=ff_chunk_size,
                                     mode=mode)
        decoder_blocks.append(decoder_block)

    # pylint: disable=g-long-lambda
    return tl.Serial(
        tl.ShiftRight(),
        positional_embedder,
        tl.Dup(),  # Stack has (x, x), the first will be shortened
        # Before shortening, we need to shift by a total of shorten_factor so as
        # not to leak information into the future. To understand why, imagine a
        # shorten factor of 2 and a sequence of length 4, ABCD. If we shift only
        # by 1, we get 0ABC, which is grouped to [0A][BC] on input while the
        # targets are still ABCD. The problem is that [0A] has access to A and
        # [BC] has access to C, so the model can simply copy them and thus peek
        # into the future. Shifting once more, to [00][AB], solves the problem:
        # the first "big" symbol becomes all zeros and the rest is shifted enough.
        tl.ShiftRight(n_positions=shorten_factor - 1),
        tl.Fn(
            'Shorten',
            lambda x: jnp.reshape(  # Shorten -- move to depth.
                x, (x.shape[0], x.shape[1] // shorten_factor, -1)),
            n_out=1),
        tl.Dense(d_model),
        tl.Dup(),  # Stack has (short_x, short_x, x)
        tl.ReversibleSerial(decoder_blocks),
        tl.Select([0], n_in=2),
        tl.LayerNorm(),
        tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode),  # pylint: disable=no-value-for-parameter
        tl.Dense(shorten_factor * d_embedding),
        tl.Fn(
            'ProlongBack',
            lambda x: jnp.reshape(  # Prolong back.
                x, (x.shape[0], x.shape[1] * shorten_factor, -1)),
            n_out=1),
        tl.Concatenate(),  # Concatenate with just the embeddings.
        tl.CausalConv(d_embedding),
        tl.Relu(),
        tl.SRU(d_embedding),  # One RNN layer for conditional dependence.
        tl.Dense(vocab_size),
        tl.LogSoftmax())
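
A NumPy shape walk-through (toy sizes assumed) of the `Shorten` and `ProlongBack` reshapes used above:

import numpy as np

# Toy sizes (assumed); mirrors the reshape lambdas in 'Shorten' and 'ProlongBack'.
batch, length, d_embedding, shorten_factor = 2, 8, 4, 2

x = np.zeros((batch, length, d_embedding))              # embedded, already shifted right
short_x = x.reshape(batch, length // shorten_factor, -1)
print(short_x.shape)    # (2, 4, 8): length divided by F, groups of F vectors moved to depth

y = np.zeros((batch, length // shorten_factor, shorten_factor * d_embedding))
prolonged = y.reshape(batch, y.shape[1] * shorten_factor, -1)
print(prolonged.shape)  # (2, 8, 4): back to the original length and embedding depth
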
Example No. 20
0
def TransformerDecoder(vocab_size=None,
                       d_model=512,
                       d_ff=2048,
                       n_layers=6,
                       n_heads=8,
                       max_len=2048,
                       dropout=0.1,
                       dropout_shared_axes=None,
                       mode='train',
                       ff_activation=tl.Relu):
    """Returns a Transformer decoder.

  This model maps sequential inputs to sequential outputs:

    - input if `vocab_size` is specified: rank 2 tensor representing a batch
      of text strings via token IDs plus padding markers; shape is
      (batch_size, sequence_length). The tensor elements are integers in
      `range(vocab_size)`, and `0` values mark padding positions.

    - input if `vocab_size` is None: rank 3 tensor representing a batch
      of activation vectors; shape is (batch_size, sequence_length, `d_model`).

    - output: rank 3 tensor with shape (batch_size, sequence_length, `d_model`).

  The model uses causal attention and does *not* shift the input to the right.
  Thus, the output for position `t` is based on inputs up to and including
  position `t`.

  Args:
    vocab_size: If specified, gives the input vocabulary size -- each element
        of the input tensor should be an integer in `range(vocab_size)`.
        If None, indicates that the model expects as input floating point
        vectors, each with `d_model` components.
    d_model: Final dimension of tensors at most points in the model, including
        the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each decoder
        block.
    n_layers: Number of decoder blocks. Each block includes attention, dropout,
        residual, feed-forward (`Dense`), and activation layers.
    n_heads: Number of attention heads.
    max_len: Maximum symbol length for positional encoding.
    dropout: Stochastic rate (probability) for dropping an activation value
        when applying dropout within a decoder block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask.
        Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
        a useful way to save memory and apply consistent masks to activation
        vectors at different sequence positions.
    mode: If `'train'`, each decoder block will include dropout; else, it will
        pass all values through unaltered.
    ff_activation: Type of activation function at the end of each decoder
        block; must be an activation-type subclass of `Layer`.

  Returns:
    If `vocab_size` is defined: a Transformer model that maps strings (conveyed
    via token IDs) to sequences of activation vectors.

    If `vocab_size` is None: a Transformer model that maps sequences of
    activation vectors to sequences of activation vectors.
  """
    positional_encoder = [(tl.Embedding(vocab_size, d_model)
                           if vocab_size is not None else tl.Dense(d_model)),
                          tl.Dropout(rate=dropout,
                                     shared_axes=dropout_shared_axes,
                                     mode=mode),
                          tl.PositionalEncoding(max_len=max_len)]

    decoder_blocks = [
        # pylint: disable=g-complex-comprehension
        _DecoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                      mode, ff_activation) for i in range(n_layers)
    ]

    # Assemble and return the model.
    return tl.Serial(  # toks
        positional_encoder,  # vecs
        decoder_blocks,  # vecs
        tl.LayerNorm(),  # vecs
    )
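
A hedged usage sketch of the `vocab_size=None` path with assumed sizes, assuming the factory is exposed as `trax.models.TransformerDecoder` as in recent Trax releases; the decoder then consumes float activation vectors rather than token IDs:

import numpy as np
from trax import models, shapes

model = models.TransformerDecoder(vocab_size=None, d_model=32, d_ff=64,
                                  n_layers=2, n_heads=2, mode='eval')
x = np.zeros((1, 16, 32), dtype=np.float32)   # (batch_size, sequence_length, d_model)
model.init(shapes.signature(x))               # initialize weights for this input signature
y = model(x)
print(y.shape)                                # expected: (1, 16, 32)
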
Example No. 21
0
def LayerDropSkippingTransformerLM(vocab_size,
                                   d_model=512,
                                   d_ff=2048,
                                   n_layers=6,
                                   n_heads=8,
                                   dropout=0.1,
                                   max_len=2048,
                                   mode='train',
                                   ff_activation=tl.Relu,
                                   skip_fraction=0.4):
  """Returns a Skipping Transformer language model.

  The input to the model is a tensor of tokens. (This model uses only the
  decoder part of the overall Transformer.)

  Args:
    vocab_size: int: vocab size
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train', 'eval' or 'predict'; predict mode is for fast inference
    ff_activation: the non-linearity in feed-forward layer
    skip_fraction: controls how aggressively layers are skipped during training
      (the per-step layer budget is drawn uniformly from
      [0, n_layers / skip_fraction)); in 'eval' and 'predict' modes all layers run

  Returns:
    A Transformer language model as a layer that maps from a tensor of tokens
    to activations over a vocab set.
  """
  embedder = [
      tl.Embedding(vocab_size, d_model),
      tl.Dropout(rate=dropout, mode=mode),
      tl.PositionalEncoding(max_len=max_len, mode=mode),
  ]

  def ConditionedBlock(current_layer_num):
    return tl.Serial(
        # stack: embedding, n_layers_to_keep
        tl.Select([1, 0, 1]),  # n_layers_to_keep, embedding, n_layers_to_keep
        tl.Cond(
            # if n_layers_to_keep > current_layer_num
            LargerThan(float(current_layer_num)),
            # then: run block
            tl.Serial(transformer._DecoderBlock(  # pylint: disable=g-complex-comprehension,protected-access
                d_model, d_ff, n_heads, dropout, [], mode, ff_activation)),
            # else: run noop
            tl.Serial()
            )
        # stack: embedding, n_layers_to_keep
        )

  if mode == 'train':
    minimum_layers = 0.0
    maximum_layers = float(n_layers) / skip_fraction
  else:
    minimum_layers = maximum_layers = float(n_layers)

  return tl.Serial(
      tl.ShiftRight(mode=mode),
      embedder,
      # stack: embedding
      tl.RandomUniform(minimum_layers, maximum_layers, sync=True),
      # stack: n_layers_to_keep, embedding
      tl.Swap(),
      # stack: embedding, n_layers_to_keep
      [ConditionedBlock(i) for i in range(n_layers)],
      # stack: embedding, n_layers_to_keep
      tl.Select([0], n_in=2),  # stack: embedding
      tl.LayerNorm(),
      tl.Dense(vocab_size),
      tl.LogSoftmax(),
  )
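
A plain-Python sketch (values assumed) of the layer-skipping rule above: a single uniform draw `n_layers_to_keep` is shared across blocks, and block `i` runs only when `n_layers_to_keep > i`:

import numpy as np

n_layers, skip_fraction = 6, 0.4
minimum_layers, maximum_layers = 0.0, float(n_layers) / skip_fraction   # 0.0, 15.0

rng = np.random.default_rng(0)
n_layers_to_keep = rng.uniform(minimum_layers, maximum_layers)   # one shared draw per step
ran = [i for i in range(n_layers) if n_layers_to_keep > i]       # blocks whose Cond branch runs
print(round(n_layers_to_keep, 2), ran)
# The draw comes from U(0, n_layers / skip_fraction), so it falls below n_layers
# with probability skip_fraction.
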
Example No. 22
0
def TransformerLM(vocab_size,
                  d_model=512,
                  d_ff=2048,
                  n_layers=6,
                  n_heads=8,
                  max_len=2048,
                  dropout=0.1,
                  dropout_shared_axes=None,
                  mode='train',
                  ff_activation=tl.Relu):
    """Returns a Transformer language model.

  This model performs autoregressive language modeling:

    - input: rank 2 tensor representing a batch of text strings via token IDs
      plus padding markers; shape is (batch_size, sequence_length). The tensor
      elements are integers in `range(vocab_size)`, and `0` values mark padding
      positions.

    - output: rank 3 tensor representing a batch of log-probability
      distributions for each sequence position over possible token IDs;
      shape is (batch_size, sequence_length, `vocab_size`).

  This model uses only the decoder part of the overall Transformer.

  Args:
    vocab_size: Input vocabulary size -- each element of the input tensor
        should be an integer in `range(vocab_size)`. These integers typically
        represent token IDs from a vocabulary-based tokenizer.
    d_model: Final dimension of tensors at most points in the model, including
        the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each decoder
        block.
    n_layers: Number of decoder blocks. Each block includes attention, dropout,
        residual, feed-forward (`Dense`), and activation layers.
    n_heads: Number of attention heads.
    max_len: Maximum symbol length for positional encoding.
    dropout: Stochastic rate (probability) for dropping an activation value
        when applying dropout within a decoder block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask.
        Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
        a useful way to save memory and apply consistent masks to activation
        vectors at different sequence positions.
    mode: If `'predict'`, use fast inference. If `'train'`, each decoder block
        will include dropout; else, it will pass all values through unaltered.
    ff_activation: Type of activation function at the end of each decoder
        block; must be an activation-type subclass of `Layer`.

  Returns:
    A Transformer language model as a layer that maps from a tensor of tokens
    to activations over a vocab set.
  """
    positional_encoder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        tl.PositionalEncoding(max_len=max_len, mode=mode)
    ]

    decoder_blocks = [
        # pylint: disable=g-complex-comprehension
        _DecoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                      mode, ff_activation) for i in range(n_layers)
    ]

    # Assemble and return the model.
    return tl.Serial(  # tokens (or chunked tuple of tokens)
        tl.ShiftRight(mode=mode),  # toks
        positional_encoder,  # vecs
        decoder_blocks,  # vecs
        tl.LayerNorm(),  # vecs
        tl.Dense(vocab_size),  # vecs
    )
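
A hedged causality check with assumed sizes, assuming the factory is exposed as `trax.models.TransformerLM`: because of `tl.ShiftRight` plus causal attention, perturbing a token should leave the outputs at earlier positions unchanged (eval mode, so dropout is off):

import numpy as np
from trax import models, shapes

model = models.TransformerLM(vocab_size=50, d_model=32, d_ff=64,
                             n_layers=2, n_heads=2, mode='eval')
tokens = np.arange(1, 11, dtype=np.int32).reshape(1, 10)
model.init(shapes.signature(tokens))

out1 = np.asarray(model(tokens))
tokens2 = tokens.copy()
tokens2[0, 5] = 3                      # perturb one token in the middle
out2 = np.asarray(model(tokens2))
# Output at position t depends only on tokens up to t - 1, so positions <= 5 match.
print(np.allclose(out1[:, :6], out2[:, :6], atol=1e-5))   # expected: True
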
Example No. 23
0
 def test_new_weights(self):
     layer = tl.Dropout(rate=0.1, mode='train')
     layer.init(None)
     self.assertEmpty(layer.weights)
Example No. 24
0
def TransformerEncoder(vocab_size,
                       n_classes=10,
                       d_model=512,
                       d_ff=2048,
                       n_layers=6,
                       n_heads=8,
                       max_len=2048,
                       dropout=0.1,
                       dropout_shared_axes=None,
                       mode='train',
                       ff_activation=tl.Relu):
    """Returns a Transformer encoder merged with an N-way categorization head.

  This model performs text categorization:

    - input: rank 2 tensor representing a batch of text strings via token IDs
      plus padding markers; shape is (batch_size, sequence_length). The tensor
      elements are integers in `range(vocab_size)`, and `0` values mark padding
      positions.

    - output: rank 2 tensor representing a batch of log-probability
      distributions over N categories; shape is (batch_size, `n_classes`).

  Args:
    vocab_size: Input vocabulary size -- each element of the input tensor
        should be an integer in `range(vocab_size)`. These integers typically
        represent token IDs from a vocabulary-based tokenizer.
    n_classes: Final dimension of the output tensors, representing N-way
        classification.
    d_model: Final dimension of tensors at most points in the model, including
        the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each encoder
        block.
    n_layers: Number of encoder blocks. Each block includes attention, dropout,
        residual, feed-forward (`Dense`), and activation layers.
    n_heads: Number of attention heads.
    max_len: Maximum symbol length for positional encoding.
    dropout: Stochastic rate (probability) for dropping an activation value
        when applying dropout within an encoder block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask.
        Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
        a useful way to save memory and apply consistent masks to activation
        vectors at different sequence positions.
    mode: If `'train'`, each encoder block will include dropout; else, it will
        pass all values through unaltered.
    ff_activation: Type of activation function at the end of each encoder
        block; must be an activation-type subclass of `Layer`.

  Returns:
    A Transformer model that maps strings (conveyed via token IDs) to
    probability-like activations over a range of output classes.
  """
    positional_encoder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        tl.PositionalEncoding(max_len=max_len)
    ]

    encoder_blocks = [
        _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                      mode, ff_activation) for i in range(n_layers)
    ]

    # Assemble and return the model.
    return tl.Serial(  # toks
        # Encode.
        tl.Branch(positional_encoder, tl.PaddingMask()),  # vecs masks
        encoder_blocks,  # vecs masks
        tl.Select([0], n_in=2),  # vecs
        tl.LayerNorm(),  # vecs

        # Map to output categories.
        tl.Mean(axis=1),  # vecs
        tl.Dense(n_classes),  # vecs
    )
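
A hedged usage sketch with assumed sizes, assuming the factory is exposed as `trax.models.TransformerEncoder`; trailing zeros act as padding per the docstring, and the head produces one score vector per sequence:

import numpy as np
from trax import models, shapes

model = models.TransformerEncoder(vocab_size=100, n_classes=4, d_model=32, d_ff=64,
                                  n_layers=2, n_heads=2, mode='eval')
tokens = np.array([[5, 6, 7, 0, 0, 0]], dtype=np.int32)   # one padded sequence
model.init(shapes.signature(tokens))
scores = model(tokens)
print(scores.shape)                                       # expected: (1, 4)
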
Example No. 25
0
def ConfigurableTransformerEncoder(vocab_size,
                                   n_classes=10,
                                   d_model=512,
                                   d_ff=2048,
                                   n_layers=6,
                                   n_heads=8,
                                   max_len=2048,
                                   dropout=0.1,
                                   dropout_shared_axes=None,
                                   mode='train',
                                   ff_activation=tl.Relu,
                                   ff_dropout=0.1,
                                   ff_chunk_size=0,
                                   ff_use_sru=0,
                                   ff_sparsity=0,
                                   ff_sparsity_type='1inN',
                                   attention_chunk_size=0,
                                   attention_type=tl.Attention,
                                   pos_type=None,
                                   pos_axial_shape=None,
                                   pos_d_axial_embs=None):
    """Returns a Transformer encoder merged with an N-way categorization head.

  This model performs text categorization:

    - input: rank 2 tensor representing a batch of text strings via token IDs
      plus padding markers; shape is (batch_size, sequence_length). The tensor
      elements are integers in `range(vocab_size)`, and `0` values mark padding
      positions.

    - output: rank 2 tensor representing a batch of log-probability
      distributions over N categories; shape is (batch_size, `n_classes`).

  Args:
    vocab_size: Input vocabulary size -- each element of the input tensor should
      be an integer in `range(vocab_size)`. These integers typically represent
      token IDs from a vocabulary-based tokenizer.
    n_classes: Final dimension of the output tensors, representing N-way
      classification.
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each encoder
      block.
    n_layers: Number of encoder blocks. Each block includes attention, dropout,
      residual, feed-forward (`Dense`), and activation layers.
    n_heads: Number of attention heads.
    max_len: Maximum symbol length for positional encoding.
    dropout: Stochastic rate (probability) for dropping an activation value when
      applying dropout within an encoder block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    mode: If `'train'`, each encoder block will include dropout; else, it will
      pass all values through unaltered.
    ff_activation: Type of activation function at the end of each encoder block;
      must be an activation-type subclass of `Layer`.
    ff_dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout after the FF dense layer.
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_use_sru: int or pair of ints; if > 0, we use this many SRU layers
      in addition to the feed-forward block (second int specifies sru size)
    ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity
    ff_sparsity_type: string, if ff_sparsity >0,
      use SparseFF if ff_sparsity_type=`'1inN'` and
      use BlockSparseFF if ff_sparsity_type=`'Block'`
    attention_chunk_size: int, if > 0 run attention chunked at this size
    attention_type: The attention layer to use for the encoder part.
    pos_type: string, the type of positional embeddings to use.
    pos_axial_shape: tuple of ints: input shape to use for the axial position
      encoding. If unset, axial position encoding is disabled.
    pos_d_axial_embs: tuple of ints: depth of position embedding for each axis.
      Tuple length must match pos_axial_shape, and values must sum to d_model.

  Returns:
    A Transformer model that maps strings (conveyed via token IDs) to
    probability-like activations over a range of output classes.
  """
    positional_encoder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        PositionalEncoder(mode, dropout, max_len, pos_type, pos_axial_shape,
                          pos_d_axial_embs)
    ]

    # pylint: disable=g-complex-comprehension
    encoder_blocks = [
        EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                     mode, ff_activation, ff_dropout, ff_chunk_size,
                     ff_use_sru, ff_sparsity, ff_sparsity_type,
                     attention_chunk_size, attention_type)
        for i in range(n_layers)
    ]
    # pylint: enable=g-complex-comprehension

    # Assemble and return the model.
    return tl.Serial(  # toks
        # Encode.
        tl.Branch(positional_encoder, tl.PaddingMask()),  # vecs masks
        encoder_blocks,  # vecs masks
        tl.Select([0], n_in=2),  # vecs
        tl.LayerNorm(),  # vecs

        # Map to output categories.
        tl.Mean(axis=1),  # vecs
        tl.Dense(n_classes),  # vecs
    )
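
A hedged configuration sketch (values are illustrative assumptions, not recommendations) that turns on the sparse feed-forward block and chunked attention documented above, assuming the factory above is in scope:

model = ConfigurableTransformerEncoder(
    vocab_size=32000,
    n_classes=2,
    ff_sparsity=64,              # > 0: use a sparse feed-forward block
    ff_sparsity_type='1inN',     # SparseFF variant (vs. 'Block' for BlockSparseFF)
    attention_chunk_size=128,    # run attention in chunks of 128 positions
    mode='train')
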
Example No. 26
0
 def PositionalEncoder(vocab_size):  # tokens --> vectors
   return [
       tl.Embedding(d_model, vocab_size),
       tl.Dropout(rate=dropout, mode=mode),
       tl.PositionalEncoding(max_len=max_len),
   ]
Example No. 27
0
def ConfigurableTransformerLM(vocab_size,
                              d_model=512,
                              d_ff=2048,
                              n_layers=6,
                              n_heads=8,
                              max_len=2048,
                              dropout=0.1,
                              dropout_shared_axes=None,
                              mode='train',
                              ff_activation=tl.Relu,
                              ff_dropout=0.1,
                              ff_chunk_size=0,
                              ff_use_sru=0,
                              ff_sparsity=0,
                              ff_sparsity_type='1inN',
                              loss_sparsity_type='mult',
                              loss_sparsity=0,
                              loss_d_lowrank=0,
                              loss_sparsity_prob=None,
                              attention_chunk_size=0,
                              attention_type=tl.CausalAttention,
                              pos_type=None,
                              pos_axial_shape=None,
                              pos_d_axial_embs=None):
    """Returns a Transformer language model.

  This model performs autoregressive language modeling:

    - input: rank 2 tensor representing a batch of text strings via token IDs
      plus padding markers; shape is (batch_size, sequence_length). The tensor
      elements are integers in `range(vocab_size)`, and `0` values mark padding
      positions.

    - output: rank 3 tensor representing a batch of log-probability
      distributions for each sequence position over possible token IDs;
      shape is (batch_size, sequence_length, `vocab_size`).

  This model uses only the decoder part of the overall Transformer.

  Args:
    vocab_size: Input vocabulary size -- each element of the input tensor should
      be an integer in `range(vocab_size)`. These integers typically represent
      token IDs from a vocabulary-based tokenizer.
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each decoder
      block.
    n_layers: Number of decoder blocks. Each block includes attention, dropout,
      residual, feed-forward (`Dense`), and activation layers.
    n_heads: Number of attention heads.
    max_len: Maximum symbol length for positional encoding.
    dropout: Stochastic rate (probability) for dropping an activation value when
      applying dropout within a decoder block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    mode: If `'predict'`, use fast inference. If `'train'`, each decoder block
      will include dropout; else, it will pass all values through unaltered.
    ff_activation: Type of activation function at the end of each decoder block;
      must be an activation-type subclass of `Layer`.
    ff_dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout after the FF dense layer.
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_use_sru: int or pair of ints; if > 0, we use this many SRU layers
      in addition to the feed-forward block (second int specifies sru size)
    ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity
    ff_sparsity_type: string, if ff_sparsity >0,
      use SparseFF if ff_sparsity_type=`'1inN'` and
      use BlockSparseFF if ff_sparsity_type=`'Block'`
    loss_sparsity_type: string, type of sparsity to used in loss layer. See
      SparseDenseWithOptions for options. None if no sparsity should be used.
    loss_sparsity: int, the sparsity for loss layer (if used)
    loss_d_lowrank: int, the dimensions for intermediate layer (if used)
    loss_sparsity_prob: float, the probability for sparse version of loss to be
      used. If None, only sparse version is used.
    attention_chunk_size: int, if > 0 run attention chunked at this size
    attention_type: The attention layer to use for the decoder part.
    pos_type: string, the type of positional embeddings to use.
    pos_axial_shape: tuple of ints: input shape to use for the axial position
      encoding. If unset, axial position encoding is disabled.
    pos_d_axial_embs: tuple of ints: depth of position embedding for each axis.
      Tuple length must match pos_axial_shape, and values must sum to d_model.

  Returns:
    A Transformer language model as a layer that maps from a tensor of tokens
    to activations over a vocab set.
  """
    positional_encoder = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        PositionalEncoder(mode, dropout, max_len, pos_type, pos_axial_shape,
                          pos_d_axial_embs)
    ]

    # pylint: disable=g-complex-comprehension
    decoder_blocks = [
        DecoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                     mode, ff_activation, ff_dropout, ff_chunk_size,
                     ff_use_sru, ff_sparsity, ff_sparsity_type,
                     attention_chunk_size, attention_type)
        for i in range(n_layers)
    ]
    # pylint: enable=g-complex-comprehension

    # Assemble and return the model.
    return tl.Serial(  # tokens (or chunked tuple of tokens)
        tl.ShiftRight(mode=mode),  # toks
        positional_encoder,  # vecs
        decoder_blocks,  # vecs
        tl.LayerNorm(),  # vecs
        tl.SparseDenseWithOptions(  # vecs
            vocab_size,
            d_input=d_model,
            sparsity_type=loss_sparsity_type,
            sparsity=loss_sparsity,
            d_lowrank=loss_d_lowrank,
            prob_sparse=loss_sparsity_prob,
            mode=mode),
    )
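
A hedged configuration sketch (values are illustrative assumptions) pairing sparse feed-forward blocks with a sparse output layer via the `loss_*` options above, assuming the factory above is in scope:

model = ConfigurableTransformerLM(
    vocab_size=32000,
    ff_sparsity=64,              # sparse feed-forward block ('1inN' SparseFF)
    ff_sparsity_type='1inN',
    loss_sparsity_type='mult',   # sparsity type passed to SparseDenseWithOptions
    loss_sparsity=4,
    loss_d_lowrank=128,
    loss_sparsity_prob=0.5,      # use the sparse output layer on half of the steps
    mode='train')
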
Example No. 28
0
 def _Dropout():
   return tl.Dropout(rate=dropout, mode=mode)
Example No. 29
0
 def _Dropout():
     return tl.Dropout(rate=dropout,
                       shared_axes=dropout_shared_axes,
                       mode=mode)
Example No. 30
0
 def model_fn(mode='train'):
     return tl.Serial(
         tl.Dropout(mode=mode, rate=0.1),
         tl.BatchNorm(mode=mode),
         models.MLP(layer_widths=(16, 16, n_classes),
                    mode=mode))