def TransformerRevnetLM(vocab_size,
                        d_model=512,
                        d_ff=2048,
                        d_attention_key=64,
                        d_attention_value=64,
                        n_layers=6,
                        n_heads=8,
                        dropout=0.1,
                        max_len=2048,
                        n_chunks=32,
                        n_attention_chunks=8,
                        attention_type=DotProductAttention,
                        mode='train'):
  """Reversible transformer language model (only uses a decoder, no encoder).

  Args:
    vocab_size: int: vocab size
    d_model: int: depth of *each half* of the two-part features
    d_ff: int: depth of feed-forward layer
    d_attention_key: int: depth of key vector for each attention head
    d_attention_value: int: depth of value vector for each attention head
    n_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    n_chunks: int: number of chunks (must match input pipeline)
    n_attention_chunks: int: number of chunks for attention
    attention_type: class: attention class to use, such as DotProductAttention.
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
  positional_embedder = [
      tl.Embedding(d_model, vocab_size),
      # TODO(kitaev): add dropout
      tl.PositionalEncoding(max_len=max_len),
  ]
  return tl.Model(
      tl.Concatenate(n_items=n_chunks),
      tl.ShiftRight(),
      positional_embedder,
      tl.Dup(),
      tl.ReversibleSerial([  # pylint: disable=g-complex-comprehension
          DecoderBlock(d_model, d_ff,
                       d_attention_key, d_attention_value, n_heads,
                       n_attention_chunks, attention_type,
                       dropout, mode)
          for _ in range(n_layers)
      ]),
      tl.Parallel(tl.LayerNorm(), tl.LayerNorm()),
      tl.Concatenate(),
      Split(n_sections=n_chunks, axis=-2),  # pylint: disable=no-value-for-parameter
      Map([
          tl.Dense(vocab_size),
          tl.LogSoftmax(),
      ], n_sections=n_chunks),
  )
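

# A minimal usage sketch (not part of the original module); the vocab size and
# layer count below are illustrative placeholders. Note that `n_chunks` must
# match the input pipeline: the model's leading tl.Concatenate(n_items=n_chunks)
# expects the sequence to arrive split into that many pieces.
#
#   model = TransformerRevnetLM(vocab_size=32000, n_layers=6, n_chunks=32,
#                               mode='train')
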
def ReformerLM(vocab_size,
               d_model=512,
               d_ff=2048,
               d_attention_key=64,
               d_attention_value=64,
               n_layers=6,
               n_heads=8,
               dropout=0.1,
               max_len=2048,
               n_chunks=32,
               n_attention_chunks=8,
               attention_type=tl.DotProductCausalAttention,
               share_qk=False,
               mode='train'):
  """Reversible transformer language model (only uses a decoder, no encoder).

  Args:
    vocab_size: int: vocab size
    d_model: int: depth of *each half* of the two-part features
    d_ff: int: depth of feed-forward layer
    d_attention_key: int: depth of key vector for each attention head
    d_attention_value: int: depth of value vector for each attention head
    n_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    n_chunks: int: number of chunks (must match input pipeline)
    n_attention_chunks: int: number of chunks for attention
    attention_type: class: attention class to use, such as DotProductAttention.
    share_qk: bool, whether to share queries and keys.
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
  positional_embedder = [
      tl.Embedding(d_model, vocab_size),
      BroadcastedDropout(rate=dropout, mode=mode),  # pylint: disable=no-value-for-parameter
      tl.PositionalEncoding(max_len=max_len),
  ]
  return tl.Model(
      tl.Concatenate(n_items=n_chunks),
      tl.ShiftRight(),
      positional_embedder,
      tl.Dup(),
      tl.ReversibleSerial([  # pylint: disable=g-complex-comprehension
          DecoderBlock(d_model, d_ff,
                       d_attention_key, d_attention_value, n_heads,
                       n_attention_chunks, attention_type,
                       dropout, share_qk, mode)
          for _ in range(n_layers)
      ] + [
          SplitForOutput(n_sections=n_chunks, axis=-2),  # pylint: disable=no-value-for-parameter
      ]),
      Map([
          # TODO(kitaev): Test whether dropout should go before or after the
          # LayerNorm, and whether dropout broadcasting is needed here.
          tl.LayerNorm(),
          BroadcastedDropout(rate=dropout, mode=mode),  # pylint: disable=no-value-for-parameter
          tl.Dense(vocab_size),
          tl.LogSoftmax(),
      ], n_sections=n_chunks),
  )
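

# A minimal usage sketch (not part of the original module). ReformerLM has the
# same chunked-input contract as TransformerRevnetLM above; the values here are
# illustrative placeholders, and `share_qk=True` simply ties queries and keys
# as documented in the signature.
#
#   model = ReformerLM(vocab_size=32000, n_layers=6, n_chunks=32,
#                      share_qk=True, mode='train')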