Code example #1
File: combinators_test.py  Project: ajiangcn/trax
def test_2axes(self):
    layer = tl.BatchLeadingAxes(self._Id3Dim(), n_last_axes_to_keep=2)
    ys = layer(np.zeros((3, 4, 5)))
    self.assertEqual(ys.shape, (3, 4, 5))
    ys = layer(np.zeros((2, 3, 4, 5)))
    self.assertEqual(ys.shape, (2, 3, 4, 5))
    ys = layer(np.zeros((1, 2, 3, 4, 5)))
    self.assertEqual(ys.shape, (1, 2, 3, 4, 5))
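The test above relies on _Id3Dim, a helper defined elsewhere in the test class. As a rough self-contained sketch of the same behavior (tl.Negate is an arbitrary weight-free stand-in, not part of the original test): BatchLeadingAxes flattens all leading axes except the last n_last_axes_to_keep into a single batch axis, applies the wrapped layer, then restores the original shape.

# Sketch only; tl.Negate is an arbitrary weight-free inner layer, so the
# combinator can be called directly, as in the test above.
import numpy as np
from trax import layers as tl

layer = tl.BatchLeadingAxes(tl.Negate(), n_last_axes_to_keep=2)
ys = layer(np.zeros((2, 3, 4, 5)))  # the inner layer sees shape (6, 4, 5)
print(ys.shape)                     # (2, 3, 4, 5)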
Code example #2
def ChunkedFeedForward(d_model, d_ff, dropout, activation, act_dropout,
                       chunk_size, use_bfloat16, mode):
  """Chunked feed-forward block with layer normalization at start."""
  ff = FeedForward(d_model, d_ff, dropout, activation, act_dropout,
                   use_bfloat16, mode)
  if chunk_size < 1:
    return ff
  return tl.BatchLeadingAxes(tl.Chunk(tl.Serial(ff), chunk_size))
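In this block, tl.Chunk applies the wrapped feed-forward to fixed-size chunks of the leading axis to save memory, and tl.BatchLeadingAxes first collapses all leading axes (batch, length) into that single axis and restores them afterwards. A minimal stand-alone sketch of the same wrapping, assuming trax and numpy are installed; tl.Relu stands in for the FeedForward block, which is not shown in this snippet:

# Sketch only; tl.Relu() is a stand-in for the feed-forward sublayers.
import numpy as np
from trax import layers as tl, shapes

chunk_size = 4
ff = tl.Serial(tl.Relu())
layer = tl.BatchLeadingAxes(tl.Chunk(ff, chunk_size))

x = np.zeros((2, 8, 16), dtype=np.float32)  # (batch, length, d_model)
layer.init(shapes.signature(x))             # leading axes get flattened to 2 * 8 = 16
print(layer(x).shape)                       # (2, 8, 16)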
Code example #3
File: rl.py  Project: zhaoqiuye/trax
def Value(
    body=None,
    normalizer=None,
    inject_actions=False,
    inject_actions_n_layers=1,
    inject_actions_dim=64,
    batch_axes=None,
    mode='train',
    is_discrete=False,
    vocab_size=2
):
  """Attaches a value head to a model body."""
  if body is None:
    body = lambda mode: []
  if normalizer is None:
    normalizer = lambda mode: []
  if batch_axes is None:
    batch = lambda x: x
  else:
    batch = lambda x: tl.BatchLeadingAxes(x, n_last_axes_to_keep=batch_axes)

  def ActionInjector(mode):
    if inject_actions:
      if is_discrete:
        encode_layer = tl.Parallel(
            tl.Dense(inject_actions_dim),
            tl.Embedding(inject_actions_dim, vocab_size=vocab_size))
      else:
        encode_layer = tl.Parallel(
            tl.Dense(inject_actions_dim),
            tl.Dense(inject_actions_dim),
        )
      return tl.Serial(
          # Input: (body output, actions).
          encode_layer,
          tl.Add(),
          models.PureMLP(
              layer_widths=(inject_actions_dim,) * inject_actions_n_layers,
              out_activation=True,
              flatten=False,
              mode=mode,
          )
      )
    else:
      return []

  return tl.Serial(
      batch(normalizer(mode=mode)),
      batch(body(mode=mode)),
      ActionInjector(mode=mode),
      tl.Dense(1),
  )
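Here the batch_axes argument decides whether the normalizer and body are wrapped in tl.BatchLeadingAxes, so observations with extra leading axes (for example a time axis in addition to batch) are flattened before those sublayers run and reshaped back afterwards. A rough sketch of just that wrapper, with an arbitrary Dense body standing in for body(mode=mode) (an assumption, not code from rl.py):

# Sketch only; tl.Dense(8) is an arbitrary stand-in for the model body.
import numpy as np
from trax import layers as tl, shapes

batch_axes = 1
wrapped_body = tl.BatchLeadingAxes(tl.Dense(8), n_last_axes_to_keep=batch_axes)

obs = np.zeros((16, 20, 4), dtype=np.float32)  # (batch, time, obs_dim)
wrapped_body.init(shapes.signature(obs))
print(wrapped_body(obs).shape)                 # (16, 20, 8)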
Code example #4
def FeedForwardWithOptions(d_model,
                           d_ff,
                           dropout,
                           dropout_shared_axes,
                           ff_activation,
                           ff_dropout,
                           ff_chunk_size,
                           ff_use_sru,
                           ff_sparsity,
                           mode,
                           use_bfloat16=False,
                           ff_sparsity_type='1inN'):
    """Feed-Forward block with all the options.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    dropout: Stochastic rate (probability) for dropping an activation value when
      applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    ff_activation: Type of activation function at the end of each block; must be
      an activation-type subclass of `Layer`.
    ff_dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout after the FF dense layer.
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_use_sru: int or pair of ints; if > 0, we use this many SRU layers
      in addition to the feed-forward block (second int specifies sru size)
    ff_sparsity: int, tuple or string; if not 0, use sparse feed-forward block
      with this sparsity
    mode: If `'train'`, each block will include dropout; else, it will pass all
      values through unaltered.
    use_bfloat16: whether to use bfloat16 for weights (default: False).
    ff_sparsity_type: string; if ff_sparsity > 0, use SparseFF when
      ff_sparsity_type=`'1inN'`, BlockSparseFF when ff_sparsity_type=`'Block'`,
      and SwitchSparseFF when ff_sparsity_type=`'Switch'`.

  Returns:
    A list of layers which maps vectors to vectors.
  """
    if ff_sparsity and ff_sparsity_type == '1inN':
        temperature, quant_prob = 0.1, 0.3
        if isinstance(ff_sparsity, str):
            # This is hacky but used to pass ff_sparsity in yaml sweep files.
            ff_sparsity = [(float(x) if '.' in x else int(x))
                           for x in ff_sparsity.split()]
        if isinstance(ff_sparsity, (list, tuple)):
            if len(ff_sparsity) == 2:
                n_elements_in_block, d_lowrank = ff_sparsity
            else:
                n_elements_in_block, d_lowrank, temperature, quant_prob = ff_sparsity
        else:
            assert isinstance(ff_sparsity, int)
            n_elements_in_block, d_lowrank = ff_sparsity, d_ff // ff_sparsity
        ff = tl.SparseFF(d_ff,
                         n_elements_in_block=n_elements_in_block,
                         d_lowrank=d_lowrank,
                         temperature=temperature,
                         quant_prob=quant_prob,
                         use_bfloat16=use_bfloat16,
                         mode=mode,
                         dropout_rate=dropout,
                         dropout_shared_axes=dropout_shared_axes,
                         ff_chunk_size=ff_chunk_size)
    elif ff_sparsity and ff_sparsity_type == 'Block':
        ff = tl.BlockSparseFF(d_ff, n_experts=ff_sparsity, mode=mode)
    elif ff_sparsity and ff_sparsity_type == 'Switch':
        ff = tl.SwitchSparseFF(d_ff, n_experts=ff_sparsity, mode=mode)
    else:
        ff = _FeedForward(d_model, d_ff, dropout, ff_activation, ff_dropout,
                          use_bfloat16, mode)
    res = [tl.LayerNorm(), ff]
    if ff_sparsity_type != '1inN' or ff_sparsity == 0:
        # SparseFF has Dropout and BatchLeadingAxes built-in.
        res.append(
            tl.Dropout(rate=dropout,
                       shared_axes=dropout_shared_axes,
                       mode=mode))
        if ff_chunk_size > 0:
            res = tl.BatchLeadingAxes(tl.Chunk(tl.Serial(res), ff_chunk_size))
    if ff_use_sru:
        if isinstance(ff_use_sru, (list, tuple)):
            sru_n_layers, sru_n_units = ff_use_sru
        else:
            sru_n_layers, sru_n_units = ff_use_sru, 32
        sru = [tl.SRU(sru_n_units, mode=mode) for _ in range(sru_n_layers)]
        block = [tl.LayerNorm(), tl.Dense(sru_n_units)
                 ] + sru + [tl.Dense(d_model)]
        res = tl.Residual(block, shortcut=res)
    return [res]
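FeedForwardWithOptions returns a list of layers, so it is normally spliced into a larger tl.Serial model. Below is a hedged usage sketch with the plain dense configuration (no sparsity, no SRU, no chunking); the import path is an assumption based on recent trax versions and may differ in the fork this example was taken from.

# Sketch only; the module path below is assumed, not confirmed by this snippet.
import numpy as np
from trax import layers as tl, shapes
from trax.models.research.configurable_transformer import FeedForwardWithOptions

block = tl.Serial(FeedForwardWithOptions(
    d_model=32, d_ff=64, dropout=0.1, dropout_shared_axes=None,
    ff_activation=tl.Relu, ff_dropout=0.1, ff_chunk_size=0,
    ff_use_sru=0, ff_sparsity=0, mode='train'))

x = np.zeros((2, 8, 32), dtype=np.float32)  # (batch, length, d_model)
block.init(shapes.signature(x))
print(block(x).shape)                       # (2, 8, 32)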
Code example #5
def FeedForwardWithOptions(d_model,
                           d_ff,
                           dropout,
                           dropout_shared_axes,
                           ff_activation,
                           ff_dropout,
                           ff_chunk_size,
                           ff_use_sru,
                           ff_sparsity,
                           mode,
                           ff_sparsity_type='1inN'):
    """Feed-Forward block with all the options.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    dropout: Stochastic rate (probability) for dropping an activation value when
      applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    ff_activation: Type of activation function at the end of each block; must be
      an activation-type subclass of `Layer`.
    ff_dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout after the FF dense layer.
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward
    ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity
    mode: If `'train'`, each block will include dropout; else, it will pass all
      values through unaltered.
    ff_sparsity_type: string; if ff_sparsity > 0, use SparseFF when
      ff_sparsity_type=`'1inN'` and BlockSparseFF when
      ff_sparsity_type=`'Block'`.

  Returns:
    A list of layers which maps vectors to vectors.
  """
    if ff_use_sru:
        return [tl.SRU(d_model) for _ in range(ff_use_sru)]
    elif ff_sparsity and ff_sparsity_type == '1inN':
        ff = tl.SparseFF(d_ff,
                         n_elements_in_block=ff_sparsity,
                         d_lowrank=d_ff // ff_sparsity,
                         mode=mode)
        if ff_chunk_size < 1:
            chunked_ff = ff
        else:
            chunked_ff = tl.BatchLeadingAxes(
                tl.Chunk(tl.Serial(ff), ff_chunk_size))
        return [
            tl.LayerNorm(), chunked_ff,
            tl.Dropout(rate=dropout,
                       shared_axes=dropout_shared_axes,
                       mode=mode)
        ]
    elif ff_sparsity and ff_sparsity_type == 'Block':
        return [
            tl.LayerNorm(),
            tl.BlockSparseFF(d_ff, num_experts=ff_sparsity, mode=mode),
            tl.Dropout(rate=dropout,
                       shared_axes=dropout_shared_axes,
                       mode=mode)
        ]
    else:
        return [
            ChunkedFeedForward(d_model, d_ff, dropout, ff_activation,
                               ff_dropout, ff_chunk_size, mode)
        ]