Exemple #1
0
def FeedForwardWithOptions(d_model, d_ff, dropout, ff_activation, ff_dropout,
                           ff_chunk_size, ff_use_sru, ff_sparsity, mode):
    """Build the feed-forward part of a block according to the given flags.

    Exactly one variant is built; the flags are checked in this order:

    1. `ff_use_sru` truthy: a stack of `tl.SRU` layers.
    2. `ff_sparsity` truthy: `LayerNorm`, `SparseFF`, `Dropout`.
    3. otherwise: a single `ChunkedFeedForward`.

    Args:
      d_model: Passed to `tl.SRU` and to `ChunkedFeedForward`.
      d_ff: Passed to `tl.SparseFF` and to `ChunkedFeedForward`.
      dropout: Rate of the `tl.Dropout` that follows `SparseFF`; also forwarded
        to `ChunkedFeedForward`.
      ff_activation: Forwarded to `ChunkedFeedForward` (dense variant only).
      ff_dropout: Forwarded to `ChunkedFeedForward` (dense variant only).
      ff_chunk_size: Forwarded to `ChunkedFeedForward` (dense variant only).
      ff_use_sru: If truthy, the number of `tl.SRU(d_model)` layers to return
        instead of any feed-forward layer.
      ff_sparsity: If truthy, used as `n_elements_in_block` of `tl.SparseFF`,
        with `d_lowrank` set to `d_ff // ff_sparsity`.
      mode: Forwarded to `SparseFF`, `Dropout` and `ChunkedFeedForward`.

    Returns:
      A list of layers.
    """
    # The SRU option wins over every other setting.
    if ff_use_sru:
        sru_stack = []
        for _ in range(ff_use_sru):
            sru_stack.append(tl.SRU(d_model))
        return sru_stack

    if ff_sparsity:
        norm = tl.LayerNorm()
        sparse_ff = tl.SparseFF(
            d_ff,
            n_elements_in_block=ff_sparsity,
            d_lowrank=d_ff // ff_sparsity,
            mode=mode,
        )
        post_dropout = tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode)
        return [norm, sparse_ff, post_dropout]

    # Plain dense feed-forward; chunking is handled inside the helper.
    dense_ff = ChunkedFeedForward(d_model, d_ff, dropout, ff_activation,
                                  ff_dropout, ff_chunk_size, mode)
    return [dense_ff]
def FeedForwardWithOptions(d_model,
                           d_ff,
                           dropout,
                           dropout_shared_axes,
                           ff_activation,
                           ff_dropout,
                           ff_chunk_size,
                           ff_use_sru,
                           ff_sparsity,
                           mode,
                           use_bfloat16=False,
                           ff_sparsity_type='1inN'):
    """Feed-Forward block with all the options.

    Args:
      d_model: Final dimension of tensors at most points in the model,
        including the initial embedding output.
      d_ff: Size of special dense layer in the feed-forward part of each block.
      dropout: Stochastic rate (probability) for dropping an activation value
        when applying dropout within a block.
      dropout_shared_axes: Tensor axes on which to share a dropout mask.
        Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
        a useful way to save memory and apply consistent masks to activation
        vectors at different sequence positions.
      ff_activation: Type of activation function at the end of each block; must
        be an activation-type subclass of `Layer`. Only used by the dense
        (non-sparse) feed-forward.
      ff_dropout: Stochastic rate (probability) for dropping an activation
        value when applying dropout after the FF dense layer. Only used by the
        dense (non-sparse) feed-forward.
      ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks.
      ff_use_sru: int or pair of ints; if truthy, this many SRU layers are
        added around the feed-forward block as a residual branch. A pair is
        `(n_layers, n_units)`; a plain int uses 32 units.
      ff_sparsity: int, list/tuple or string; if truthy, use a sparse
        feed-forward block. For the `'1inN'` type the accepted forms are:
          * int `n`: `n_elements_in_block=n`, `d_lowrank=d_ff // n`;
          * 2 items: `(n_elements_in_block, d_lowrank)`;
          * otherwise 4 items:
            `(n_elements_in_block, d_lowrank, temperature, quant_prob)`;
          * string: whitespace-separated numbers, parsed into one of the
            list forms above (items containing `'.'` become floats).
        When not given, `temperature` is 0.1 and `quant_prob` is 0.3.
        For `'Block'` and `'Switch'` the value is passed on as `n_experts`.
      mode: If `'train'`, each block will include dropout; else, it will pass
        all values through unaltered.
      use_bfloat16: whether to use bfloat16 for weights (default: False).
        Passed to `SparseFF` and to the dense feed-forward.
      ff_sparsity_type: string, if `ff_sparsity` is truthy,
        use SparseFF if ff_sparsity_type=`'1inN'`,
        use BlockSparseFF if ff_sparsity_type=`'Block'`,
        use SwitchSparseFF if ff_sparsity_type=`'Switch'`;
        any other value falls back to the dense feed-forward.

    Returns:
      A one-element list holding the assembled feed-forward block.
    """
    # Decide once whether the '1inN' SparseFF is used. The same predicate must
    # drive both the layer choice and the "needs external dropout/chunking"
    # check below; previously the latter tested `ff_sparsity == 0`, so falsy
    # non-zero values (None, '', ()) got a dense FF without dropout/chunking.
    use_sparse_1inn = bool(ff_sparsity) and ff_sparsity_type == '1inN'

    if use_sparse_1inn:
        temperature, quant_prob = 0.1, 0.3
        if isinstance(ff_sparsity, str):
            # This is hacky but used to pass ff_sparsity in yaml sweep files.
            ff_sparsity = [(float(x) if '.' in x else int(x))
                           for x in ff_sparsity.split()]
        if isinstance(ff_sparsity, (list, tuple)):
            if len(ff_sparsity) == 2:
                n_elements_in_block, d_lowrank = ff_sparsity
            else:
                # Any other length is expected to carry all four settings.
                (n_elements_in_block, d_lowrank, temperature,
                 quant_prob) = ff_sparsity
        else:
            assert isinstance(ff_sparsity, int)
            n_elements_in_block, d_lowrank = ff_sparsity, d_ff // ff_sparsity
        ff = tl.SparseFF(d_ff,
                         n_elements_in_block=n_elements_in_block,
                         d_lowrank=d_lowrank,
                         temperature=temperature,
                         quant_prob=quant_prob,
                         use_bfloat16=use_bfloat16,
                         mode=mode,
                         dropout_rate=dropout,
                         dropout_shared_axes=dropout_shared_axes,
                         ff_chunk_size=ff_chunk_size)
    elif ff_sparsity and ff_sparsity_type == 'Block':
        ff = tl.BlockSparseFF(d_ff, n_experts=ff_sparsity, mode=mode)
    elif ff_sparsity and ff_sparsity_type == 'Switch':
        ff = tl.SwitchSparseFF(d_ff, n_experts=ff_sparsity, mode=mode)
    else:
        ff = _FeedForward(d_model, d_ff, dropout, ff_activation, ff_dropout,
                          use_bfloat16, mode)

    res = [tl.LayerNorm(), ff]
    if not use_sparse_1inn:
        # SparseFF has Dropout and BatchLeadingAxes built-in; every other
        # variant needs them added here.
        res.append(
            tl.Dropout(rate=dropout,
                       shared_axes=dropout_shared_axes,
                       mode=mode))
        if ff_chunk_size > 0:
            res = tl.BatchLeadingAxes(tl.Chunk(tl.Serial(res), ff_chunk_size))

    if ff_use_sru:
        if isinstance(ff_use_sru, (list, tuple)):
            sru_n_layers, sru_n_units = ff_use_sru
        else:
            sru_n_layers, sru_n_units = ff_use_sru, 32
        sru = [tl.SRU(sru_n_units, mode=mode) for _ in range(sru_n_layers)]
        # Project to the SRU width, run the SRU stack, project back to d_model;
        # the feed-forward built above becomes the shortcut of the residual.
        block = [tl.LayerNorm(), tl.Dense(sru_n_units)
                 ] + sru + [tl.Dense(d_model)]
        res = tl.Residual(block, shortcut=res)
    return [res]
def FeedForwardWithOptions(d_model,
                           d_ff,
                           dropout,
                           dropout_shared_axes,
                           ff_activation,
                           ff_dropout,
                           ff_chunk_size,
                           ff_use_sru,
                           ff_sparsity,
                           mode,
                           ff_sparsity_type='1inN'):
    """Assemble the feed-forward part of a block from the given options.

    The options are examined in a fixed order and the first match wins:
    SRU stack, `'1inN'` sparse FF, `'Block'` sparse FF, dense chunked FF.

    Args:
      d_model: Final dimension of tensors at most points in the model,
        including the initial embedding output.
      d_ff: Size of special dense layer in the feed-forward part of each block.
      dropout: Stochastic rate (probability) for dropping an activation value
        when applying dropout within a block.
      dropout_shared_axes: Tensor axes on which to share a dropout mask; passed
        as `shared_axes` to the `tl.Dropout` that follows a sparse FF layer.
      ff_activation: Type of activation function at the end of each block;
        forwarded to `ChunkedFeedForward` (dense variant only).
      ff_dropout: Stochastic rate (probability) for dropping an activation
        value after the FF dense layer; forwarded to `ChunkedFeedForward`
        (dense variant only).
      ff_chunk_size: int; chunk size for the feed-forward. The `'1inN'` variant
        skips chunking when this is below 1.
      ff_use_sru: int; if truthy, return this many `tl.SRU(d_model)` layers
        instead of a feed-forward.
      ff_sparsity: int; if truthy, use a sparse feed-forward block with this
        sparsity.
      mode: If `'train'`, each block will include dropout; else, it will pass
        all values through unaltered.
      ff_sparsity_type: string, only consulted when `ff_sparsity` is truthy:
        `'1inN'` selects SparseFF, `'Block'` selects BlockSparseFF; any other
        value falls through to the dense variant.

    Returns:
      A list of layers which maps vectors to vectors.
    """
    # SRU replaces the feed-forward entirely and takes precedence.
    if ff_use_sru:
        sru_stack = []
        for _ in range(ff_use_sru):
            sru_stack.append(tl.SRU(d_model))
        return sru_stack

    # The sparsity type only matters when sparsity is actually requested.
    sparse_kind = ff_sparsity_type if ff_sparsity else None

    if sparse_kind == '1inN':
        sparse_ff = tl.SparseFF(d_ff,
                                n_elements_in_block=ff_sparsity,
                                d_lowrank=d_ff // ff_sparsity,
                                mode=mode)
        # Only the sparse layer itself is chunked; norm and dropout stay
        # outside the chunked wrapper.
        core = (sparse_ff if ff_chunk_size < 1 else tl.BatchLeadingAxes(
            tl.Chunk(tl.Serial(sparse_ff), ff_chunk_size)))
        layers = [tl.LayerNorm(), core]
        layers.append(
            tl.Dropout(rate=dropout,
                       shared_axes=dropout_shared_axes,
                       mode=mode))
        return layers

    if sparse_kind == 'Block':
        layers = [tl.LayerNorm()]
        layers.append(
            tl.BlockSparseFF(d_ff, num_experts=ff_sparsity, mode=mode))
        layers.append(
            tl.Dropout(rate=dropout,
                       shared_axes=dropout_shared_axes,
                       mode=mode))
        return layers

    # Dense fallback: chunking is handled inside the helper.
    dense_ff = ChunkedFeedForward(d_model, d_ff, dropout, ff_activation,
                                  ff_dropout, ff_chunk_size, mode)
    return [dense_ff]