Example #1
def policy_and_value_net(rng_key,
                         batch_observations_shape,
                         observations_dtype,
                         n_actions,
                         bottom_layers_fn=(),
                         two_towers=True):
  """A policy and value net function."""

  # Layers.

  # Now, with the current logits, one head computes action probabilities and the
  # other computes the value function.
  # NOTE: LogSoftmax is used instead of Softmax for numerical stability.

  if two_towers:
    layers = [
        tl.Dup(),
        tl.Parallel(
            [bottom_layers_fn(), tl.Dense(n_actions), tl.LogSoftmax()],
            [bottom_layers_fn(), tl.Dense(1)],
        )
    ]
  else:
    layers = [
        bottom_layers_fn(),
        tl.Dup(),
        tl.Parallel(
            [tl.Dense(n_actions), tl.LogSoftmax()],
            [tl.Dense(1)],
        )
    ]
  net = tl.Model(layers)
  params, state = net.initialize(batch_observations_shape, observations_dtype,
                                 rng_key)
  return params, state, net
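The NOTE in this example prefers LogSoftmax over Softmax for numerical stability. A minimal NumPy sketch of the log-sum-exp trick behind that choice (independent of the Trax layers above):

import numpy as np

def log_softmax(logits, axis=-1):
  """Numerically stable log-softmax via the log-sum-exp trick."""
  shifted = logits - np.max(logits, axis=axis, keepdims=True)
  return shifted - np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))

logits = np.array([1000.0, 1001.0, 1002.0])
naive = np.log(np.exp(logits) / np.sum(np.exp(logits)))  # overflows to inf/nan
stable = log_softmax(logits)                             # finite log-probabilities
print(naive, stable)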
Example #2
def DecoderBlock(d_model, d_ff, d_attention_key, d_attention_value, n_heads,
                 n_attention_chunks, attention_type, dropout, mode):
    """Reversible transformer decoder layer.

  Args:
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    d_attention_key: int: depth of key vector for each attention head
    d_attention_value: int: depth of value vector for each attention head
    n_heads: int: number of attention heads
    n_attention_chunks: int: number of chunks for attention
    attention_type: class: attention class to use, such as DotProductAttention.
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """

    pre_attention = [
        Chunk(n_sections=n_attention_chunks),  # pylint: disable=no-value-for-parameter
        tl.LayerNorm(),
        tl.Dup(),
        tl.Dup(),
        tl.Parallel(
            [
                tl.ComputeAttentionHeads(n_heads=n_heads,
                                         d_head=d_attention_key)
            ],
            [
                tl.ComputeAttentionHeads(n_heads=n_heads,
                                         d_head=d_attention_key)
            ],
            [
                tl.ComputeAttentionHeads(n_heads=n_heads,
                                         d_head=d_attention_value)
            ],
        ),
    ]

    attention = attention_type(mode=mode)

    # ReversibleAttentionHalfResidual requires that post_attention be linear in
    # its input (so the backward pass can be computed without knowing the input)
    post_attention = [
        tl.ComputeAttentionOutput(n_heads=n_heads, d_model=d_model),
        Unchunk(n_sections=n_attention_chunks),  # pylint: disable=no-value-for-parameter
        BroadcastedDropout(rate=dropout, mode=mode),  # pylint: disable=no-value-for-parameter
    ]

    feed_forward = [
        FeedForward(d_model, d_ff, dropout, mode=mode),
    ]
    return [
        ReversibleAttentionHalfResidual(pre_attention, attention,
                                        post_attention),
        tl.ReversibleSwap(),
        ReversibleHalfResidual(feed_forward),
        tl.ReversibleSwap(),
    ]
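For intuition, the three parallel ComputeAttentionHeads branches above produce queries, keys and values from the same normalized input before the attention layer combines them. Below is a toy single-head causal dot-product attention in NumPy; it is a sketch only, not the Trax implementation, and the weight shapes are illustrative:

import numpy as np

def dot_product_attention(x, w_q, w_k, w_v):
  """Toy single-head causal dot-product attention over a (seq_len, d_model) input."""
  q, k, v = x @ w_q, x @ w_k, x @ w_v            # three linear maps of the same input
  scores = q @ k.T / np.sqrt(q.shape[-1])        # (seq_len, seq_len) similarity scores
  causal = np.tril(np.ones_like(scores, dtype=bool))
  scores = np.where(causal, scores, -1e9)        # block attention to future positions
  weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
  weights /= weights.sum(axis=-1, keepdims=True)
  return weights @ v                             # (seq_len, d_attention_value)

rng = np.random.default_rng(0)
x = rng.normal(size=(6, 16))       # seq_len=6, d_model=16
w_q = rng.normal(size=(16, 8))     # d_attention_key=8
w_k = rng.normal(size=(16, 8))
w_v = rng.normal(size=(16, 8))     # d_attention_value=8
print(dot_product_attention(x, w_q, w_k, w_v).shape)  # (6, 8)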
Example #3
def policy_and_value_net(n_actions, bottom_layers_fn, two_towers):
  """A policy and value net function."""

  # Layers.

  # Now, with the current logits, one head computes action probabilities and the
  # other computes the value function.
  # NOTE: LogSoftmax is used instead of Softmax for numerical stability.

  if two_towers:
    layers = [
        tl.Dup(),
        tl.Parallel(
            [bottom_layers_fn(),
             tl.Dense(n_actions),
             tl.LogSoftmax()],
            [bottom_layers_fn(), tl.Dense(1)],
        )
    ]
  else:
    layers = [
        bottom_layers_fn(),
        tl.Dup(),
        tl.Parallel(
            [tl.Dense(n_actions), tl.LogSoftmax()],
            [tl.Dense(1)],
        )
    ]
  return tl.Model(layers)
Example #4
def SumLearnedPick(positions):
    """Get a pair (vec, pos) and pick new pos."""
    succ_keys = positions[:-1, :]
    succ_values = positions[1:, :]
    subtract_1_keys = positions[1:, :]
    subtract_1_values = positions[:-1, :]
    l = int(positions.shape[0]) // 2
    add_keys = np.array([
        np.concatenate([positions[i, :], positions[j, :]]) for i in range(l)
        for j in range(l)
    ])
    add_values = np.array(
        [positions[i + j, :] for i in range(l) for j in range(l)])
    # TODO(lukaszkaiser): try this below: "for j in range(i) for i in range(2*l)"
    sub_keys = np.array([
        np.concatenate([positions[i, :], positions[j, :]]) for j in range(l)
        for i in range(l)
    ])
    sub_values = np.array(
        [positions[max(i - j, 0), :] for j in range(l) for i in range(l)])
    return tl.Serial(
        tl.Dup(), tl.Dup(), tl.Dup(), tl.Dup(),
        tl.Parallel(
            LearnedQP(),
            LearnedQP(keys=succ_keys, values=succ_values),
            LearnedQP(keys=subtract_1_keys, values=subtract_1_values),
            LearnedQP(keys=add_keys, values=add_values, binary=True),
            LearnedQP(keys=sub_keys, values=sub_values, binary=True),
        ), Unnest(), SoftmaxBranches(n_branches=5))
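The key/value tables above pair position vectors so that, assuming LearnedQP performs a soft nearest-neighbor lookup of its query against the keys, querying the add table with concat(positions[i], positions[j]) retrieves positions[i + j]. A small NumPy sketch of that lookup (illustrative only; LearnedQP itself is not reproduced here):

import numpy as np

rng = np.random.default_rng(0)
l, d = 4, 32
positions = rng.normal(size=(2 * l, d))        # random position vectors

add_keys = np.array([np.concatenate([positions[i], positions[j]])
                     for i in range(l) for j in range(l)])
add_values = np.array([positions[i + j] for i in range(l) for j in range(l)])

query = np.concatenate([positions[2], positions[3]])  # asks for position "2 + 3"
best = int(np.argmax(add_keys @ query))               # hard nearest-neighbor lookup
print(np.allclose(add_values[best], positions[5]))    # True: retrieves positions[5]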
Example #5
def AtariCnn(hidden_sizes=(32, 32), output_size=128, mode='train'):
    """An Atari CNN."""
    del mode

    # TODO(jonni): Include link to paper?
    # Input shape: (B, T, H, W, C)
    # Output shape: (B, T, output_size)
    return tl.Model(
        tl.ToFloat(),
        tl.Div(divisor=255.0),

        # Set up 4 successive game frames, concatenated on the last axis.
        tl.Dup(),
        tl.Dup(),
        tl.Dup(),
        tl.Parallel(None, _shift_right(1), _shift_right(2), _shift_right(3)),
        tl.Concatenate(n_items=4, axis=-1),  # (B, T, H, W, 4C)
        tl.Conv(hidden_sizes[0], (5, 5), (2, 2), 'SAME'),
        tl.Relu(),
        tl.Conv(hidden_sizes[1], (5, 5), (2, 2), 'SAME'),
        tl.Relu(),
        tl.Flatten(n_axes_to_keep=2),  # B, T and rest.
        tl.Dense(output_size),
        tl.Relu(),
    )
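The Dup/Parallel/Concatenate block above stacks the current frame with the three previous ones along the channel axis. A NumPy sketch of the same idea, with a hypothetical shift_right helper standing in for the _shift_right used above:

import numpy as np

def shift_right(x, n):
  """Shift a (B, T, ...) sequence right by n steps, padding the front with zeros."""
  if n == 0:
    return x
  pad = np.zeros_like(x[:, :n])
  return np.concatenate([pad, x[:, :-n]], axis=1)

frames = np.arange(2 * 5 * 1 * 1 * 3).reshape(2, 5, 1, 1, 3)  # (B, T, H, W, C)
stacked = np.concatenate([shift_right(frames, k) for k in range(4)], axis=-1)
print(stacked.shape)  # (2, 5, 1, 1, 12): 4 consecutive frames concatenated on C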
Example #6
def MultiHeadedAttentionPosition(positions,
                                 d_feature,
                                 n_heads=8,
                                 dropout=0.0,
                                 mode='train'):
    """Transformer-style multi-headed attention."""
    return tl.Serial(
        tl.Dup(),
        tl.Dup(),
        tl.Parallel(
            ApplyAndQueryPositions(
                tl.Dense(d_feature),
                pos=[SumLearnedPick(positions) for _ in range(n_heads)]),
            PreservePosition(tl.Dense(d_feature)),
            PreservePosition(tl.Dense(d_feature)),
        ),
        tl.Parallel(
            CopyHeadsPos(h=n_heads),
            MixHeadsPos(h=n_heads),
            MixHeadsPos(h=n_heads),
        ),
        tl.PureMultiHeadedAttention(d_feature=d_feature,
                                    n_heads=n_heads,
                                    dropout=dropout,
                                    mode=mode),
        tl.Parallel([], tl.Drop()),  # Drop the mask.
        CombineHeadsPos(h=n_heads),
        PreservePosition(tl.Dense(d_feature)),
    )
Example #7
def DecoderBlock(d_feature, d_feedforward, n_heads, n_attention_chunks,
                 attention_loop_stride, dropout, mode):
    """Reversible transformer decoder layer.

  Args:
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    n_attention_chunks: int: number of chunks for attention
    attention_loop_stride: int: number of query elements to compute attention
      for in parallel. Set to 0 to disable memory-efficient attention.
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """

    pre_attention = [
        Chunk(sections=n_attention_chunks),  # pylint: disable=no-value-for-parameter
        tl.LayerNorm(),
        tl.Dup(),
        tl.Dup(),
        tl.Parallel(
            [tl.Dense(d_feature),
             SplitHeads(n_heads=n_heads)],  # pylint: disable=no-value-for-parameter
            [tl.Dense(d_feature),
             SplitHeads(n_heads=n_heads)],  # pylint: disable=no-value-for-parameter
            [tl.Dense(d_feature),
             SplitHeads(n_heads=n_heads)],  # pylint: disable=no-value-for-parameter
        ),
    ]

    # TODO(kitaev): add dropout
    if attention_loop_stride < 1:
        # Use the standard implementation if no loop_stride is provided.
        attention = DotProductAttention(dropout=None, mode=mode)
    else:
        attention = MemoryEfficientDotProductAttention(
            loop_stride=attention_loop_stride, dropout=None, mode=mode)

    # ReversibleAttentionHalfResidual requires that post_attention be linear in
    # its input (so the backward pass can be computed without knowing the input)
    post_attention = [
        JoinHeads(),  # pylint: disable=no-value-for-parameter
        tl.Dense(d_feature),
        Unchunk(sections=n_attention_chunks),  # pylint: disable=no-value-for-parameter
    ]

    feed_forward = [
        FeedForward(d_feature, d_feedforward, dropout, mode=mode),
    ]
    return [
        ReversibleAttentionHalfResidual(pre_attention, attention,
                                        post_attention),
        ReversibleSwap(),
        ReversibleHalfResidual(feed_forward),
        ReversibleSwap(),
    ]
Example #8
def policy_and_value_net(n_actions, n_controls, vocab_size, bottom_layers_fn,
                         two_towers):
    """A policy and value net function."""

    # Layers.

    # Now, with the current logits, one head computes action probabilities and the
    # other computes the value function.
    # NOTE: LogSoftmax is used instead of Softmax for numerical stability.

    @tl.layer()
    def FlattenControlsIntoTime(x, **unused_kwargs):  # pylint: disable=invalid-name
        """Splits logits for actions in different controls and flattens controls."""
        return np.reshape(x, (x.shape[0], -1, n_actions))

    if vocab_size is None:
        # In continuous policies every element of the output sequence corresponds to
        # an observation.
        n_preds_per_input = n_controls
        kwargs = {}
    else:
        # In discrete policies every element of the output sequence corresponds to
        # a symbol in the discrete representation, and each control takes 1 symbol.
        n_preds_per_input = 1
        kwargs = {"vocab_size": vocab_size}

    if two_towers:
        layers = [
            tl.Dup(),
            tl.Parallel(
                [
                    bottom_layers_fn(**kwargs),
                    tl.Dense(n_preds_per_input * n_actions),
                    FlattenControlsIntoTime(),  # pylint: disable=no-value-for-parameter
                    tl.LogSoftmax()
                ],
                [
                    bottom_layers_fn(**kwargs),
                    tl.Dense(n_preds_per_input),
                    tl.Flatten()
                ],
            )
        ]
    else:
        layers = [
            bottom_layers_fn(**kwargs),
            tl.Dup(),
            tl.Parallel(
                [
                    tl.Dense(n_preds_per_input * n_actions),
                    FlattenControlsIntoTime(),  # pylint: disable=no-value-for-parameter
                    tl.LogSoftmax()
                ],
                [tl.Dense(n_preds_per_input),
                 tl.Flatten()],
            )
        ]
    return tl.Model(layers)
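FlattenControlsIntoTime above is a plain reshape: the logits for n_controls actions emitted at one input position are unfolded into n_controls separate time steps. A short NumPy sketch with illustrative shapes:

import numpy as np

batch, time, n_controls, n_actions = 2, 3, 4, 6
logits = np.zeros((batch, time, n_controls * n_actions))    # Dense output per step
flat = np.reshape(logits, (logits.shape[0], -1, n_actions))
print(flat.shape)  # (2, 12, 6): controls flattened into the time dimension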
Example #9
def EncoderDecoder(d_feature, d_feedforward, n_heads, dropout, mode):
    """Transformer encoder-decoder layer.

  The input is a triple (decoder_input, mask, encoder) where the mask is
  created from the original source to prevent attending to the padding part
  of the encoder.

  Args:
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (decoder_activations, mask, encoder).
  """
    decoder_self_attention = [  #        vecs_d   pmask vecs_e
        tl.LayerNorm(),  #        vecs_d   ..... ......
        tl.Dup(),  # vecs_d vecs_d   ..... ......
        tl.Parallel([],
                    tl.CausalMask(axis=-2)),  # ______ masks    ..... ......
        tl.MultiHeadedAttention(d_feature,
                                n_heads=n_heads,
                                dropout=dropout,
                                mode=mode),
        tl.Parallel([], tl.Drop()),  # ______   0      ..... ......
        tl.Dropout(rate=dropout, mode=mode),  # vecs_d          ..... ......
    ]
    decoder_to_encoder_attention = [  # vecs_d        masks         vecs_e
        tl.Parallel([], [], tl.Dup()),  # ______        _____  vecs_e vecs_e
        tl.Parallel([], tl.Swap()),  # ______        vecs_e masks  ......
        tl.Parallel([], tl.Dup()),  # ______ vecs_e vecs_e .....  ......
        tl.MultiHeadedAttentionQKV(  # (q k v masks ... --> vecs_d masks ...)
            d_feature,
            n_heads=n_heads,
            dropout=dropout,
            mode=mode),
        tl.Dropout(rate=dropout, mode=mode),  # vecs_d mask vecs_e
    ]
    feed_forward = [
        FeedForward(d_feature, d_feedforward, dropout, mode=mode),
    ]
    return [  # vecs_d masks vecs_e
        tl.Residual(decoder_self_attention),  # vecs_d masks vecs_e
        tl.Residual(decoder_to_encoder_attention),  # vecs_d masks vecs_e
        tl.Residual(feed_forward),  # vecs_d masks vecs_e
    ]
Example #10
    def __init__(self, pre_attention, attention, post_attention):
        self.pre_attention = tl.Serial([
            # (x1_or_y1, x2) -> (x2, x1_or_y1, x2)
            tl.Parallel([], tl.Dup()),
            tl.Swap(),
            tl.Parallel(pre_attention, [], []),
        ])
        assert hasattr(attention, 'forward_and_backward')
        self.attention = ApplyAttentionWrapper(attention)
        self.post_attention = tl.Parallel(post_attention, [], [])

        layers = [
            self.pre_attention,
            self.attention,
            self.post_attention,
            tl.Parallel(tl.Add(), []),
        ]
        super(ReversibleAttentionHalfResidual, self).__init__(layers)

        self.subtract_top = tl.Parallel(tl.SubtractTop(), [])
        self.reverse_layers = [
            self.pre_attention,
            self.attention,
            self.post_attention,
            self.subtract_top,
        ]
Example #11
def DecoderBlock(d_feature, d_feedforward, n_heads, dropout, mode):
    """Returns a layer sequence that implements a Transformer decoder block.

  The input to the layer sequence is an activation tensor.

  Args:
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    A sequence of layers that maps an activation tensor to an activation tensor.
  """
    self_attention = [
        tl.LayerNorm(),  # vec
        tl.Dup(),  # vec vec
        tl.Parallel([], tl.CausalMask(axis=-2)),  # vec mask
        tl.MultiHeadedAttention(d_feature,
                                n_heads=n_heads,
                                dropout=dropout,
                                mode=mode),
        tl.Parallel([], tl.Drop()),  # vec
        tl.Dropout(rate=dropout, mode=mode),  # vec
    ]
    feed_forward = [
        FeedForward(d_feature, d_feedforward, dropout, mode=mode),
    ]
    return [
        tl.Residual(self_attention),
        tl.Residual(feed_forward),
    ]
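tl.Residual adds each branch's output back to its input, and the self-attention branch above normalizes its input first (the pre-norm pattern). A minimal NumPy sketch of one pre-norm residual step, with a toy sublayer standing in for attention or feed-forward:

import numpy as np

def layer_norm(x, eps=1e-6):
  mean = x.mean(axis=-1, keepdims=True)
  var = x.var(axis=-1, keepdims=True)
  return (x - mean) / np.sqrt(var + eps)

def pre_norm_residual(x, sublayer):
  """Normalize, run the sublayer, then add the result back to the raw input."""
  return x + sublayer(layer_norm(x))

rng = np.random.default_rng(0)
x = rng.normal(size=(4, 8))
w = rng.normal(size=(8, 8)) * 0.1
y = pre_norm_residual(x, lambda h: np.maximum(h @ w, 0.0))  # toy feed-forward sublayer
print(y.shape)  # (4, 8)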
Example #12
def DecoderLayer(positions, d_feature, d_feedforward, n_heads, dropout, mode):
    """Transformer decoder layer.

  Args:
    positions: random vectors for positions
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
    return [
        tl.Residual(  # Self-attention block.
            PreservePosition(tl.LayerNorm()),
            tl.Dup(),
            tl.Parallel(
                [],  # activation for (q, k, v)
                tl.CausalMask(axis=-2)),  # attention mask
            MultiHeadedAttentionPosition(positions,
                                         d_feature,
                                         n_heads=n_heads,
                                         dropout=dropout,
                                         mode=mode),
            PreservePosition(tl.Dropout(rate=dropout, mode=mode))),
        ResidualFeedForward(d_feature, d_feedforward, dropout, mode=mode)
    ]
Example #13
def TransformerRevnetLM(vocab_size,
                        d_feature=512,
                        d_feedforward=2048,
                        d_attention_key=64,
                        d_attention_value=64,
                        n_layers=6,
                        n_heads=8,
                        dropout=0.1,
                        max_len=2048,
                        n_chunks=32,
                        n_attention_chunks=8,
                        attention_loop_stride=0,
                        mode='train'):
    """Reversible transformer language model (only uses a decoder, no encoder).

  Args:
    vocab_size: int: vocab size
    d_feature: int:  depth of *each half* of the two-part features
    d_feedforward: int: depth of feed-forward layer
    d_attention_key: int: depth of key vector for each attention head
    d_attention_value: int: depth of value vector for each attention head
    n_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    n_chunks: int: number of chunks (must match input pipeline)
    n_attention_chunks: int: number of chunks for attention
    attention_loop_stride: int: number of query elements to compute attention
      for in parallel. Set to 0 to disable memory-efficient attention.
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
    positional_embedder = [
        tl.Embedding(d_feature, vocab_size),
        # TODO(kitaev): add dropout
        tl.PositionalEncoding(max_len=max_len),
    ]
    return tl.Model(
        tl.Concatenate(n_items=n_chunks),
        tl.ShiftRight(),
        positional_embedder,
        tl.Dup(),
        ReversibleSerial([
            # pylint: disable=g-complex-comprehension
            DecoderBlock(d_feature, d_feedforward, d_attention_key,
                         d_attention_value, n_heads, n_attention_chunks,
                         attention_loop_stride, dropout, mode)
            for _ in range(n_layers)
        ]),
        tl.Parallel(tl.LayerNorm(), tl.LayerNorm()),
        tl.Concatenate(),
        Split(sections=n_chunks, axis=-2),  # pylint: disable=no-value-for-parameter
        Map([
            tl.Dense(vocab_size),
            tl.LogSoftmax(),
        ], sections=n_chunks),
    )
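The Split/Map pair at the end applies Dense(vocab_size) and LogSoftmax chunk by chunk, so the large vocabulary projection never materializes for the whole sequence at once; the result matches projecting the full sequence in one go. A NumPy sketch of that equivalence (illustrative shapes, sequence axis first):

import numpy as np

rng = np.random.default_rng(0)
seq, d_model, vocab, n_chunks = 8, 16, 32, 4
x = rng.normal(size=(seq, d_model))
w = rng.normal(size=(d_model, vocab))

full = x @ w                                                  # project everything at once
chunked = np.concatenate([c @ w for c in np.split(x, n_chunks, axis=0)], axis=0)
print(np.allclose(full, chunked))  # True: per-chunk projection matches the full one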
Example #14
def policy_and_value_net(n_actions, n_controls, bottom_layers_fn, two_towers):
    """A policy and value net function."""

    # Layers.

    # Now, with the current logits, one head computes action probabilities and the
    # other computes the value function.
    # NOTE: LogSoftmax is used instead of Softmax for numerical stability.

    @tl.layer()
    def FlattenControlsIntoTime(x, **unused_kwargs):  # pylint: disable=invalid-name
        """Splits logits for actions in different controls and flattens controls."""
        return np.reshape(x, (x.shape[0], -1, n_actions))

    n_logits = n_controls * n_actions

    if two_towers:
        layers = [
            tl.Dup(),
            tl.Parallel(
                [
                    bottom_layers_fn(),
                    tl.Dense(n_logits),
                    FlattenControlsIntoTime(),  # pylint: disable=no-value-for-parameter
                    tl.LogSoftmax()
                ],
                [bottom_layers_fn(),
                 tl.Dense(n_controls),
                 tl.Flatten()],
            )
        ]
    else:
        layers = [
            bottom_layers_fn(),
            tl.Dup(),
            tl.Parallel(
                [
                    tl.Dense(n_logits),
                    FlattenControlsIntoTime(),  # pylint: disable=no-value-for-parameter
                    tl.LogSoftmax()
                ],
                [tl.Dense(n_controls), tl.Flatten()],
            )
        ]
    return tl.Model(layers)
Example #15
def TransformerRevnetLM(vocab_size,
                        d_model=512,
                        d_ff=2048,
                        d_attention_key=64,
                        d_attention_value=64,
                        n_layers=6,
                        n_heads=8,
                        dropout=0.1,
                        max_len=2048,
                        n_chunks=32,
                        n_attention_chunks=8,
                        attention_type=DotProductAttention,
                        mode='train'):
    """Reversible transformer language model (only uses a decoder, no encoder).

  Args:
    vocab_size: int: vocab size
    d_model: int:  depth of *each half* of the two-part features
    d_ff: int: depth of feed-forward layer
    d_attention_key: int: depth of key vector for each attention head
    d_attention_value: int: depth of value vector for each attention head
    n_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    n_chunks: int: number of chunks (must match input pipeline)
    n_attention_chunks: int: number of chunks for attention
    attention_type: class: attention class to use, such as DotProductAttention.
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
    positional_embedder = [
        tl.Embedding(d_model, vocab_size),
        BroadcastedDropout(rate=dropout, mode=mode),  # pylint: disable=no-value-for-parameter
        tl.PositionalEncoding(max_len=max_len),
    ]
    return tl.Model(
        tl.Concatenate(n_items=n_chunks),
        tl.ShiftRight(),
        positional_embedder,
        tl.Dup(),
        tl.ReversibleSerial([
            # pylint: disable=g-complex-comprehension
            DecoderBlock(d_model, d_ff, d_attention_key, d_attention_value,
                         n_heads, n_attention_chunks, attention_type, dropout,
                         mode) for _ in range(n_layers)
        ]),
        tl.Parallel(tl.LayerNorm(), tl.LayerNorm()),
        tl.Concatenate(),
        Split(n_sections=n_chunks, axis=-2),  # pylint: disable=no-value-for-parameter
        Map([
            tl.Dense(vocab_size),
            tl.LogSoftmax(),
        ], n_sections=n_chunks),
    )
Example #16
def FrameStack(n_frames):
  """Stacks a fixed number of frames along the dimension 1."""
  # Input shape: (B, T, ..., C).
  # Output shape: (B, T, ..., C * n_frames).
  assert n_frames >= 1
  return (
      # Make n_frames copies of the input sequence.
      [tl.Dup()] * (n_frames - 1),
      # Shift copies to the right by [0, .., n_frames - 1] frames.
      tl.Parallel(*map(_shift_right, range(n_frames))),
      # Concatenate along the channel dimension.
      tl.Concatenate(n_items=n_frames, axis=-1),
  )
Example #17
    def __init__(self, residual_layers):
        self.compute_residual = tl.Serial([
            # (x1_or_y1, x2) -> (x2, x1_or_y1, x2)
            tl.Parallel([], tl.Dup()),
            tl.Swap(),
            tl.Parallel(residual_layers, [], []),
        ])

        layers = [self.compute_residual, tl.Parallel(tl.Add(), [])]
        super(ReversibleHalfResidual, self).__init__(layers)

        self.subtract_top = tl.Parallel(tl.SubtractTop(), [])
        self.reverse_layers = [self.compute_residual, self.subtract_top]
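The Add/SubtractTop pair above is what makes the residual reversible: the forward pass computes y1 = x1 + F(x2) while x2 passes through unchanged, and the reverse pass recovers x1 = y1 - F(x2) without having stored x1. A NumPy round-trip sketch with an arbitrary stand-in F:

import numpy as np

def f(x):                       # stand-in for the residual sublayer (e.g. feed-forward)
  return np.tanh(x) * 0.5

rng = np.random.default_rng(0)
x1, x2 = rng.normal(size=(4, 8)), rng.normal(size=(4, 8))

y1, y2 = x1 + f(x2), x2         # forward: compute_residual + Add, x2 passes through
x1_rec = y1 - f(y2)             # reverse: SubtractTop, recomputing f(x2) from y2 == x2
print(np.allclose(x1_rec, x1))  # True: x1 need not be stored for the backward pass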
Example #18
def EncoderDecoder(d_feature, d_feedforward, n_heads, dropout, mode):
    """Transformer encoder-decoder layer.

  The input is a triple (decoder_input, mask, encoder) where
  the mask is created from the original source to prevent attending
  to the padding part of the encoder.

  Args:
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (decoder_activations, mask, encoder).
  """
    decoder_self_attention = [
        # TODO(jonni): Work on combinators so that this flow is cleaner/clearer.
        tl.LayerNorm(),
        tl.Dup(),
        tl.CausalMask(axis=-2),  # Create the self-attention mask.
        tl.Swap(),  # Put mask behind the activations.
        tl.MultiHeadedAttention(d_feature,
                                n_heads=n_heads,
                                dropout=dropout,
                                mode=mode),
        tl.Swap(),  # Put self-attention mask on top.
        tl.Drop(),  # Drop self-attention mask.
        tl.Dropout(rate=dropout, mode=mode),
    ]
    decoder_to_encoder_attention = [
        tl.Select((0, 2, 2, 1, 2)),  # (dec, enc, enc, mask, enc-copy)
        tl.MultiHeadedAttentionQKV(  # (q, k, v, mask, ...) --> (new, mask, ...)
            d_feature,
            n_heads=n_heads,
            dropout=dropout,
            mode=mode),
        tl.Dropout(rate=dropout, mode=mode),
    ]
    feed_forward = [
        FeedForward(d_feature, d_feedforward, dropout, mode=mode),
    ]
    return [
        tl.Residual(decoder_self_attention),
        tl.Residual(decoder_to_encoder_attention),
        tl.Residual(feed_forward),
    ]
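tl.Select((0, 2, 2, 1, 2)) above rearranges the data stack by index, turning (dec, mask, enc) into (dec, enc, enc, mask, enc) so the attention layer sees (q, k, v, mask, ...). A rough mental model in plain Python, treating the stack as a tuple (an intuition aid, not the Trax implementation):

def select(stack, indices):
  """Build a new stack whose entries are stack[i] for each i in indices."""
  return tuple(stack[i] for i in indices)

stack = ('dec', 'mask', 'enc')
print(select(stack, (0, 2, 2, 1, 2)))  # ('dec', 'enc', 'enc', 'mask', 'enc')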
Example #19
def model(mode):
  del mode
  return layers.Serial(
      layers.Parallel(
          layers.Flatten(),  # Observation stack.
          layers.Embedding(d_feature=1, vocab_size=n_actions),  # Action.
      ),
      layers.Concatenate(),
      layers.Dense(n_units=1),
      layers.Dup(),
      layers.Parallel(
          layers.Dense(n_units=obs_shape[1]),  # New observation.
          None,  # Reward.
      )
  )
Example #20
def EncoderDecoderLayer(feature_depth, feedforward_depth, num_heads, dropout,
                        mode):
    """Transformer encoder-decoder layer.

  The input is a triple (decoder_input, mask, encoder) where
  the mask is created from the original source to prevent attending
  to the padding part of the encoder.

  Args:
    feature_depth: int:  depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (decoder_activations, mask, encoder).
  """
    # Decoder self-attending to decoder.
    self_attention = tl.Residual(
        tl.LayerNorm(),
        tl.Dup(),
        tl.CausalMask(axis=-2),  # Create the self-attention mask.
        tl.Swap(),  # Put mask behind the activations.
        tl.MultiHeadedAttention(feature_depth,
                                num_heads=num_heads,
                                dropout=dropout,
                                mode=mode),
        tl.Swap(),  # Put self-attention mask on top.
        tl.Drop(),  # Drop self-attention mask.
        tl.Dropout(rate=dropout, mode=mode))
    # Decoder attending to encoder.
    encoder_decoder_attention = tl.Serial(
        tl.Select((0, 2, 2, 1, 2)),  # (dec, enc, enc, mask, enc-copy)
        tl.MultiHeadedAttentionQKV(  # (q, k, v, mask, ...) --> (new, mask, ...)
            feature_depth,
            num_heads=num_heads,
            dropout=dropout,
            mode=mode),
        tl.Dropout(rate=dropout, mode=mode),
    )
    return tl.Serial(
        self_attention, tl.Residual(encoder_decoder_attention),
        ResidualFeedForward(feature_depth,
                            feedforward_depth,
                            dropout,
                            mode=mode))
Example #21
def TransformerEncoder(vocab_size,
                       n_classes=10,
                       d_model=512,
                       d_ff=2048,
                       n_layers=6,
                       n_heads=8,
                       dropout=0.1,
                       max_len=2048,
                       mode='train'):
    """Returns a Transformer encoder model.

  The input to the model is a tensor of tokens.

  Args:
    vocab_size: int: vocab size
    n_classes: how many classes on output
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_layers: int: number of encoder/decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train' or 'eval'

  Returns:
    A Transformer model as a layer that maps from a tensor of tokens to
    activations over a set of output classes.
  """
    embedder = [
        tl.Embedding(d_model, vocab_size),
        tl.Dropout(rate=dropout, name='emb_dropout', mode=mode),
        tl.PositionalEncoding(max_len=max_len),
    ]
    return tl.Model([  #      tokens
        tl.Dup(),  # toks toks
        tl.Parallel(embedder, tl.PaddingMask()),  # vecs mask
        [
            EncoderBlock(d_model, d_ff, n_heads, dropout, i, mode)
            for i in range(n_layers)
        ],  # vecs mask
        tl.Parallel([], tl.Drop()),  # ____  0
        tl.LayerNorm(),  # vecs
        tl.Mean(axis=1),  # Average on length.    # vecs
        tl.Dense(n_classes),  # vecs
        tl.LogSoftmax(),  # vecs
    ])
Example #22
def DecoderBlock(d_feature, d_feedforward, n_heads, n_attention_chunks,
                 dropout, mode):
  """Reversible transformer decoder layer.

  Args:
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    n_attention_chunks: int: number of chunks for memory-efficient attention
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
  self_attention = [
      tl.LayerNorm(),
      tl.Dup(),
      tl.Parallel([], tl.CausalMask(axis=-2)),  # Create mask.
      tl.MultiHeadedAttention(
          d_feature, n_heads=n_heads, dropout=dropout, mode=mode),
      tl.Parallel([], tl.Drop()),  # Drop mask.
      tl.Dropout(rate=dropout, mode=mode),
  ]

  # TODO(kitaev): Memory-efficient attention. This chunking is temporary.
  self_attention = [
      Split(sections=n_attention_chunks, axis=-2),  # pylint: disable=no-value-for-parameter
      Map(self_attention),
      tl.Concatenate(axis=-2),
  ]

  feed_forward = [
      FeedForward(d_feature, d_feedforward, dropout, mode=mode),
  ]
  return [
      ReversibleResidual([self_attention], [feed_forward]),
  ]
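The temporary chunking above (Split along the time axis, Map the attention over each chunk, then Concatenate) saves memory at a cost: positions can only attend within their own chunk, which is why the TODO calls it temporary. A NumPy sketch that makes the resulting block-diagonal attention pattern visible:

import numpy as np

seq_len, n_chunks = 8, 2
chunk = seq_len // n_chunks
allowed = np.zeros((seq_len, seq_len), dtype=bool)
for c in range(n_chunks):
  lo, hi = c * chunk, (c + 1) * chunk
  # Within a chunk attention is causal; across chunks there is none at all.
  allowed[lo:hi, lo:hi] = np.tril(np.ones((chunk, chunk), dtype=bool))
print(allowed.astype(int))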
Example #23
def ApplyAndQueryPositions(layer, pos):
    """Execute layer without position and pos-layers on positions.

  This takes an embedding including position x = (emb, p), and
  outputs layer(emb).pos1(x, p).....layer(emb).posn(x, p)
  where pos=[pos1...posn].

  Args:
    layer: layer to be executed without position information.
    pos: list of layers to be applied to positions.

  Returns:
    the result of this application.
  """
    n_heads = len(pos)
    return tl.Serial(
        tl.Dup(),  # (x, x)
        CutAtPosition(),  # (x_content, x_position, x)
        tl.Parallel([], tl.Swap()),  # (x_content, x, x_position)
        [tl.Parallel([], Dup2()) for _ in range(n_heads - 1)],
        # Now the stack is x_content, (x, x_position) * n_heads.
        tl.Parallel(*([layer] + pos)),
        tl.Concatenate(n_items=n_heads + 1))
Example #24
def ApplyAndQueryPositions(layer, pos):
    """Execute layer without position and pos-layers on positions.

  This takes an embedding including position x = (emb, p), and
  outputs layer(emb).pos1(x, p).....layer(emb).posn(x, p)
  where pos=[pos1...posn].

  Args:
    layer: layer to be executed without position information.
    pos: list of layers to be applied to positions.

  Returns:
    the result of this application.
  """
    n_heads = len(pos)
    return tl.Serial(
        tl.Dup(),
        CutPosition(),
        # TODO(lukaszkaiser): Rewrite without using Select.
        tl.Select(tuple([0] + [(2, 1)] * n_heads)),
        tl.Parallel(*([layer] + pos)),
        Unnest(),
        ConcatenateN(n=n_heads + 1))
Example #25
def Transformer(input_vocab_size,
                output_vocab_size=None,
                d_model=512,
                d_ff=2048,
                n_layers=6,
                n_heads=8,
                dropout=0.1,
                max_len=2048,
                mode='train'):
    """Returns a Transformer model.

  This model expects an input pair: target, source.

  Args:
    input_vocab_size: int: vocab size of the source.
    output_vocab_size: int (optional): vocab size of the target. If None, the
      source and target are assumed to have the same vocab.
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_layers: int: number of encoder/decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train' or 'eval'

  Returns:
    A Transformer model as a layer that maps from a target, source pair to
    activations over a vocab set.
  """
    in_embed = [  # tokens
        tl.Embedding(d_model, input_vocab_size),  # vecs
        tl.Dropout(rate=dropout, mode=mode),  # vecs
        tl.PositionalEncoding(max_len=max_len),  # vecs
    ]

    if output_vocab_size is None:
        output_vocab_size = input_vocab_size
        out_embed = in_embed
    else:
        out_embed = [  # tokens
            tl.Embedding(d_model, output_vocab_size),  # vecs
            tl.Dropout(rate=dropout, mode=mode),  # vecs
            tl.PositionalEncoding(max_len=max_len),  # vecs
        ]

    encoder_stack = (  # masks vectors --> masks vectors
        [
            EncoderBlock(d_model, d_ff, n_heads, dropout, i, mode)
            for i in range(n_layers)
        ])

    encoder_decoder_stack = (  # vecs_d masks vecs_e --> vecs_d masks vecs_e
        [
            EncoderDecoder(d_model, d_ff, n_heads, dropout, i, mode)
            for i in range(n_layers)
        ])

    # Input: encoder_side_tokens, decoder_side_tokens
    return tl.Model(  # tokens_e tokens_d
        tl.Swap(),  # toks_d toks_e

        # Encode.
        tl.Parallel(  # toks_d        toks_e
            [],
            [
                tl.Dup(),  # ______ toks_e toks_e
                tl.Parallel(in_embed, tl.PaddingMask()),  # ______ vecs_e masks
                encoder_stack,  # ______ vecs_e masks
                tl.LayerNorm(),  # ______ vecs_e .....
                tl.Swap()
            ]),  # ______ masks  vecs_e

        # Decode.                                  #        toks_d masks vecs_e
        tl.ShiftRight(),  #        toks_d ..... ......
        out_embed,  #        vecs_d ..... ......
        tl.Dup(),  # vecs_d vecs_d ..... ......
        tl.Parallel([], tl.EncoderDecoderMask()),  # ______    masks     ......
        encoder_decoder_stack,  # vecs_d    masks     vecs_e
        tl.Parallel([], tl.Drop(), tl.Drop()),  # vecs_d
        tl.LayerNorm(),  # vecs_d
        tl.Dense(output_vocab_size),  # vecs_d
        tl.LogSoftmax(),  # vecs_d
    )
Example #26
def Dup2():
  """Copy first 2 elements of the stack: (a, b, ...) -> (a, b, a, b, ...)."""
  return [                              # Stack is (a, b, ...)
      tl.Parallel(tl.Dup(), tl.Dup()),  # Stack is (a, a, b, b, ...)
      tl.Parallel([], tl.Swap())        # Stack is (a, b, a, b, ...)
  ]