Example #1
def policy_and_value_net(rng_key,
                         batch_observations_shape,
                         n_actions,
                         bottom_layers_fn=(),
                         two_towers=True):
    """A policy and value net function."""

    # Layers.

    # Now, with the current logits, one head computes action probabilities and the
    # other computes the value function.
    # NOTE: We use LogSoftmax instead of Softmax for numerical stability.

    if two_towers:
        net = tl.Branch(
            [bottom_layers_fn(),
             tl.Dense(n_actions),
             tl.LogSoftmax()],
            [bottom_layers_fn(), tl.Dense(1)])
    else:
        net = tl.Serial(
            bottom_layers_fn(),
            tl.Branch(
                [tl.Dense(n_actions), tl.LogSoftmax()], [tl.Dense(1)]))
    return net.initialize(batch_observations_shape, rng_key), net
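
The two-tower layout above runs the observations through two separate stacks, one ending in (Dense(n_actions), LogSoftmax) for the policy and one ending in Dense(1) for the value estimate. The following is a minimal NumPy sketch of that computation, independent of the trax `tl` API; the layer sizes, the random weights, and the `log_softmax` helper are illustrative assumptions, not part of the example above.

import numpy as np

def log_softmax(x, axis=-1):
    # Numerically stable log-softmax (the reason the examples use LogSoftmax).
    x = x - np.max(x, axis=axis, keepdims=True)
    return x - np.log(np.sum(np.exp(x), axis=axis, keepdims=True))

def policy_and_value(observations, n_actions, d_hidden=64, seed=0):
    # observations: (batch, obs_dim); the two "towers" share no parameters here.
    # Illustrative random weights; the trax example gets its parameters from
    # net.initialize(...) and training instead.
    rng = np.random.default_rng(seed)
    obs_dim = observations.shape[-1]
    w_pi = rng.normal(size=(obs_dim, d_hidden))
    w_pi_head = rng.normal(size=(d_hidden, n_actions))
    w_v = rng.normal(size=(obs_dim, d_hidden))
    w_v_head = rng.normal(size=(d_hidden, 1))
    log_probs = log_softmax(np.tanh(observations @ w_pi) @ w_pi_head)  # policy tower
    values = np.tanh(observations @ w_v) @ w_v_head                    # value tower
    return log_probs, values

obs = np.zeros((4, 8))                   # batch of 4 observations, 8 features each
log_probs, values = policy_and_value(obs, n_actions=3)
print(log_probs.shape, values.shape)     # (4, 3) (4, 1)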
Example #2
def policy_and_value_net(rng_key,
                         batch_observations_shape,
                         num_actions,
                         bottom_layers_fn=None,
                         two_towers=True):
    """A policy and value net function."""

    # Layers.

    # Now, with the current logits, one head computes action probabilities and the
    # other computes the value function.
    # NOTE: We use LogSoftmax instead of Softmax for numerical stability.

    net = None
    if not two_towers:
        tower = [] if bottom_layers_fn is None else bottom_layers_fn()
        tower.extend([
            layers.Branch(
                layers.Serial(layers.Dense(num_actions), layers.LogSoftmax()),
                layers.Dense(1))
        ])
        net = layers.Serial(*tower)
    else:
        tower1 = [] if bottom_layers_fn is None else bottom_layers_fn()
        tower2 = [] if bottom_layers_fn is None else bottom_layers_fn()

        tower1.extend([layers.Dense(num_actions), layers.LogSoftmax()])
        tower2.extend([layers.Dense(1)])

        net = layers.Branch(
            layers.Serial(*tower1),
            layers.Serial(*tower2),
        )
    assert net
    return net.initialize(batch_observations_shape, rng_key), net
Example #3
def EncoderDecoderLayer(feature_depth, feedforward_depth, num_heads, dropout,
                        mode):
    """Transformer encoder-decoder layer.

  The input is a triple (encoder, mask, decoder_input) where
  the mask is created from the original source to prevent attending
  to the padding part of the encoder.

  Args:
    feature_depth: int:  depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (encoder, mask, decoder_activations).
  """
    # Decoder self-attending to decoder.
    self_attention = layers.Residual(
        layers.LayerNorm(),
        layers.Branch(),
        layers.Parallel(
            layers.Identity(),  # activation for (q, k, v)
            layers.CausalMask(axis=-2)),  # attention mask
        layers.MultiHeadedAttention(feature_depth,
                                    num_heads=num_heads,
                                    dropout=dropout,
                                    mode=mode),
        layers.Dropout(rate=dropout, mode=mode))
    # Decoder attending to encoder.
    encoder_decoder_attention = layers.Serial(
        layers.Reorder(output=((2, 0, 0), 1)),  # ((dec, enc, enc), mask)
        layers.MultiHeadedAttentionQKV(  # ((q, k, v), mask) --> new v
            feature_depth,
            num_heads=num_heads,
            dropout=dropout,
            mode=mode),
        layers.Dropout(rate=dropout, mode=mode),
    )
    return layers.Serial(
        layers.Parallel(layers.Identity(), layers.Identity(), self_attention),
        layers.Branch(),
        layers.Parallel(layers.Identity(), encoder_decoder_attention),
        layers.UnnestBranches(),  # (encoder, mask, old_act, new_act)
        layers.Reorder(output=(0, 1, (2, 3))),
        layers.Parallel(  # Residual after encoder-decoder attention.
            layers.Identity(), layers.Identity(), layers.SumBranches()),
        layers.Parallel(  # Feed-forward on the third component (decoder).
            layers.Identity(), layers.Identity(),
            ResidualFeedForward(feature_depth,
                                feedforward_depth,
                                dropout,
                                mode=mode)))
Example #4
def EncoderDecoderLayer(feature_depth, feedforward_depth, num_heads, dropout,
                        mode):
    """Transformer encoder-decoder layer.

  The input is a triple (encoder, mask, decoder_input) where
  the mask is created from the original source to prevent attending
  to the padding part of the encoder.

  Args:
    feature_depth: int:  depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer, returning a triple (encoder, mask, decoder_activations).
  """
    # Decoder self-attending to decoder.
    self_attention = tl.Residual(
        tl.LayerNorm(),
        tl.Branch(tl.NoOp(), tl.CausalMask(axis=-2)),  # create mask
        tl.MultiHeadedAttention(feature_depth,
                                num_heads=num_heads,
                                dropout=dropout,
                                mode=mode),
        tl.Select(0),  # drop mask
        tl.Dropout(rate=dropout, mode=mode))
    # Decoder attending to encoder.
    encoder_decoder_attention = tl.Serial(
        tl.Select((2, 0, 0, 1)),  # (dec, enc, enc, mask)
        tl.MultiHeadedAttentionQKV(  # (q, k, v, mask) --> new, mask
            feature_depth,
            num_heads=num_heads,
            dropout=dropout,
            mode=mode),
        tl.Select(0),  # drop the mask
        tl.Dropout(rate=dropout, mode=mode),
    )
    return tl.Serial(
        tl.Parallel(tl.NoOp(), tl.NoOp(), self_attention),
        tl.Branch(tl.NoOp(), encoder_decoder_attention),
        tl.Select(inputs=(('encoder', 'mask', 'old_act'), 'new_act'),
                  output=('encoder', 'mask', ('old_act', 'new_act'))),
        tl.Parallel(  # Residual after encoder-decoder attention.
            tl.NoOp(), tl.NoOp(), tl.Add()),
        tl.Parallel(  # Feed-forward on the third component (decoder).
            tl.NoOp(), tl.NoOp(),
            ResidualFeedForward(feature_depth,
                                feedforward_depth,
                                dropout,
                                mode=mode)))
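
The comments above describe `tl.Select((2, 0, 0, 1))` as turning the stack (encoder, mask, decoder) into (dec, enc, enc, mask), i.e. a permutation with duplication by index, and `tl.Select(0)` as keeping only the first element. A minimal pure-Python sketch of that assumed behaviour; the `select` helper below is illustrative, not the trax implementation.

def select(inputs, indices):
    # Pick (and possibly duplicate) elements of `inputs` by position.
    # Not the trax implementation; just the stack semantics the comments suggest.
    return tuple(inputs[i] for i in indices)

stack = ('encoder', 'mask', 'decoder')
print(select(stack, (2, 0, 0, 1)))   # ('decoder', 'encoder', 'encoder', 'mask')
print(select(stack, (0,)))           # ('encoder',) -- "keep only the first element"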
Example #5
def AtariCnn(hidden_sizes=(32, 32), output_size=128):
    # Input's shape = (B, T, H, W, C)
    return tl.Serial(
        tl.Div(divisor=255.0),
        # Create 4 copies of the input, shifted right by 0, 1, 2, and 3 steps.
        tl.Branch(
            tl.NoOp(), tl.ShiftRight(),
            tl.Serial(
                tl.ShiftRight(),
                tl.ShiftRight(),
            ), tl.Serial(
                tl.ShiftRight(),
                tl.ShiftRight(),
                tl.ShiftRight(),
            )),
        # Concatenated on the last axis.
        tl.Concatenate(axis=-1),  # (B, T, H, W, 4C)
        tl.Rebatch(tl.Conv(hidden_sizes[0], (5, 5), (2, 2), 'SAME'), 2),
        tl.Relu(),
        tl.Rebatch(tl.Conv(hidden_sizes[1], (5, 5), (2, 2), 'SAME'), 2),
        tl.Relu(),
        tl.Flatten(num_axis_to_keep=2),  # B, T and rest.
        tl.Dense(output_size),
        tl.Relu(),
        # Eventually this is shaped (B, T, output_size)
    )
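
The Branch/ShiftRight/Concatenate prefix above effectively gives each time step the current frame plus the three previous ones, stacked on the channel axis. Below is a NumPy sketch of that preprocessing, assuming ShiftRight shifts along the time axis and pads the front with zeros; the `shift_right` helper and the toy shapes are illustrative, not the trax API.

import numpy as np

def shift_right(x, n=1):
    # Shift along the time axis (axis 1), padding the front with zeros.
    # Assumption: this matches what tl.ShiftRight does to (B, T, ...) tensors.
    return np.concatenate([np.zeros_like(x[:, :n]), x[:, :-n]], axis=1)

frames = np.arange(2 * 5 * 1 * 1 * 3).reshape(2, 5, 1, 1, 3).astype(np.float32)  # (B, T, H, W, C)
stacked = np.concatenate(
    [frames, shift_right(frames, 1), shift_right(frames, 2), shift_right(frames, 3)],
    axis=-1)
print(stacked.shape)  # (2, 5, 1, 1, 12): each step now sees 4 frames' worth of channels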
Example #6
    def Encoder(source, source_mask):
        """Transformer encoder stack.

    Args:
      source: layer variable: raw source sequences
      source_mask: layer variable: self-attention mask

    Returns:
      Layer variable that outputs encoded source.
    """
        encoder_layer = layers.Serial(
            # input attends to self
            layers.Residual(
                layers.LayerNorm(),
                layers.Branch(size=4),
                layers.Parallel(
                    layers.Identity(),  # query
                    layers.Identity(),  # key
                    layers.Identity(),  # value
                    source_mask),  # attention mask
                multi_attention,
                layers.Dropout(dropout, mode=mode)),
            # feed-forward
            ResidualFeedForward(feature_depth,
                                feedforward_depth,
                                dropout,
                                mode=mode),
        )
        return layers.Serial(
            source,
            source_embedding_layer,
            layers.repeat(encoder_layer, num_layers),
            layers.LayerNorm(),
        )
Example #7
def DecoderLayer(feature_depth, feedforward_depth, num_heads, dropout, mode):
    """Transformer decoder layer.

  Args:
    feature_depth: int:  depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
    return layers.Serial(
        layers.Residual(  # Self-attention block.
            layers.LayerNorm(),
            layers.Branch(),
            layers.Parallel(
                layers.Identity(),  # activation for (q, k, v)
                layers.CausalMask(axis=-2)),  # attention mask
            layers.MultiHeadedAttention(feature_depth,
                                        num_heads=num_heads,
                                        dropout=dropout,
                                        mode=mode),
            layers.Dropout(rate=dropout, mode=mode)),
        ResidualFeedForward(feature_depth,
                            feedforward_depth,
                            dropout,
                            mode=mode))
Example #8
def DecoderLayer(positions,
                 d_feature,
                 d_feedforward,
                 n_heads,
                 dropout,
                 mode):
  """Transformer decoder layer.

  Args:
    positions: random vectors for positions
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
  return [
      tl.Residual(  # Self-attention block.
          PreservePosition(tl.LayerNorm()),
          tl.Branch([],  # activation for (q, k, v)
                    tl.CausalMask(axis=-2)),  # attention mask
          MultiHeadedAttentionPosition(positions,
                                       d_feature, n_heads=n_heads,
                                       dropout=dropout, mode=mode),
          PreservePosition(tl.Dropout(rate=dropout, mode=mode))
      ),
      ResidualFeedForward(d_feature, d_feedforward, dropout, mode=mode)
  ]
Example #9
def DecoderLayer(feature_depth, feedforward_depth, num_heads, dropout, mode):
    """Transformer decoder layer.

  Args:
    feature_depth: int:  depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
    return tl.Serial(
        tl.Residual(  # Self-attention block.
            tl.LayerNorm(),
            tl.Branch(tl.Copy(), tl.CausalMask(axis=-2)),  # Create mask.
            tl.MultiHeadedAttention(feature_depth,
                                    num_heads=num_heads,
                                    dropout=dropout,
                                    mode=mode),
            tl.Select(0),  # Drop the mask.
            tl.Dropout(rate=dropout, mode=mode)),
        ResidualFeedForward(feature_depth,
                            feedforward_depth,
                            dropout,
                            mode=mode))
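
`tl.CausalMask(axis=-2)` in these self-attention blocks is what prevents a position from attending to later positions. A NumPy sketch of the mask that is assumed here, a lower-triangular boolean matrix over the length axis; the exact shape trax produces may differ.

import numpy as np

def causal_mask(length):
    # True where query position i may attend to key position j (j <= i).
    # The exact tensor shape tl.CausalMask produces may differ; this is the idea.
    return np.tril(np.ones((length, length), dtype=bool))

print(causal_mask(4).astype(int))
# [[1 0 0 0]
#  [1 1 0 0]
#  [1 1 1 0]
#  [1 1 1 1]]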
Example #10
def SumLearnedPick(positions):
  """Get a pair (vec, pos) and pick new pos."""
  succ_keys = positions[:-1, :]
  succ_values = positions[1:, :]
  subtract_1_keys = positions[1:, :]
  subtract_1_values = positions[:-1, :]
  l = int(positions.shape[0]) // 2
  add_keys = np.array([np.concatenate([positions[i, :], positions[j, :]])
                       for i in range(l) for j in range(l)])
  add_values = np.array([positions[i + j, :]
                         for i in range(l) for j in range(l)])
  # TODO(lukaszkaiser): try this below: "for j in range(i) for i in range(2*l)"
  sub_keys = np.array([np.concatenate([positions[i, :], positions[j, :]])
                       for j in range(l) for i in range(l)])
  sub_values = np.array([positions[max(i - j, 0), :]
                         for j in range(l) for i in range(l)])
  return tl.Serial(
      tl.Branch(
          LearnedQP(),
          LearnedQP(keys=succ_keys, values=succ_values),
          LearnedQP(keys=subtract_1_keys, values=subtract_1_values),
          LearnedQP(keys=add_keys, values=add_values, binary=True),
          LearnedQP(keys=sub_keys, values=sub_values, binary=True),
      ),
      Unnest(),
      SoftmaxBranches(n_branches=5)
  )
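
The add/sub key-value tables built above pair a concatenated key (position i, position j) with the vector stored at position i + j (or max(i - j, 0)). A small NumPy check of that construction, using a toy `positions` array in place of the learned position vectors.

import numpy as np

positions = np.arange(8, dtype=np.float32).reshape(8, 1)  # toy: position i is the vector [i]
l = int(positions.shape[0]) // 2

add_keys = np.array([np.concatenate([positions[i, :], positions[j, :]])
                     for i in range(l) for j in range(l)])
add_values = np.array([positions[i + j, :]
                       for i in range(l) for j in range(l)])

# With the toy positions, the key [i, j] is paired with the vector at position i + j.
assert np.array_equal(add_keys[:, 0] + add_keys[:, 1], add_values[:, 0])
print(add_keys.shape, add_values.shape)  # (16, 2) (16, 1)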
Example #11
def DecoderBlock(d_feature, d_feedforward, n_heads, dropout, mode):
    """Transformer decoder layer.

  Args:
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
    self_attention = [
        tl.LayerNorm(),
        tl.Branch([], tl.CausalMask(axis=-2)),  # Create mask.
        tl.MultiHeadedAttention(d_feature,
                                n_heads=n_heads,
                                dropout=dropout,
                                mode=mode),
        tl.Select(0),  # Drop mask.
        tl.Dropout(rate=dropout, mode=mode),
    ]
    feed_forward = [
        FeedForward(d_feature, d_feedforward, dropout, mode=mode),
    ]
    return [
        tl.Residual(self_attention),
        tl.Residual(feed_forward),
    ]
Example #12
    def Decoder(memory, target, target_mask, memory_mask):
        """Transformer decoder stack.

    Args:
      memory: layer variable: encoded source sequences
      target: layer variable: raw target sequences
      target_mask: layer variable: self-attention mask
      memory_mask: layer variable: memory attention mask

    Returns:
      Layer variable that outputs the decoder activations.
    """
        decoder_layer = layers.Serial(
            # target attends to self
            layers.Residual(
                layers.LayerNorm(),
                layers.Branch(size=4),
                layers.Parallel(
                    layers.Identity(),  # query
                    layers.Identity(),  # key
                    layers.Identity(),  # value
                    target_mask),  # attention mask
                multi_attention,
                layers.Dropout(dropout, mode=mode)),
            # target attends to encoded source
            layers.Residual(
                layers.LayerNorm(),
                layers.Branch(size=4),
                layers.Parallel(
                    layers.Identity(),  # query
                    memory,  # key
                    memory,  # value
                    memory_mask),  # attention mask
                multi_attention,
                layers.Dropout(dropout, mode=mode)),
            # feed-forward
            ResidualFeedForward(feature_depth,
                                feedforward_depth,
                                dropout,
                                mode=mode))
        return layers.Serial(
            target,
            target_embedding_layer,
            layers.repeat(decoder_layer, num_layers),
            layers.LayerNorm(),
        )
Example #13
def WideResnetBlock(channels, strides=(1, 1), channel_mismatch=False):
    """WideResnet convolutational block."""
    main = layers.Serial(
        layers.BatchNorm(), layers.Relu(),
        layers.Conv(channels, (3, 3), strides, padding='SAME'),
        layers.BatchNorm(), layers.Relu(),
        layers.Conv(channels, (3, 3), padding='SAME'))
    shortcut = layers.Identity() if not channel_mismatch else layers.Conv(
        channels, (3, 3), strides, padding='SAME')
    return layers.Serial(layers.Branch(), layers.Parallel(main, shortcut),
                         layers.SumBranches())
Example #14
def Transformer(vocab_size,
                d_feature=512,
                d_feedforward=2048,
                n_layers=6,
                n_heads=8,
                dropout=0.1,
                max_len=2048,
                mode='train'):
    """Transformer.

  This model expects a pair (source, target) as input.

  Args:
    vocab_size: int: vocab size (shared source and target).
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_layers: int: number of encoder/decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train' or 'eval'

  Returns:
    the Transformer model.
  """
    positional_embedder = [
        tl.Embedding(d_feature, vocab_size),
        tl.Dropout(rate=dropout, mode=mode),
        tl.PositionalEncoding(max_len=max_len),
    ]
    encoder = [
        tl.Branch(positional_embedder, tl.PaddingMask()),
        [
            EncoderBlock(d_feature, d_feedforward, n_heads, dropout, mode)
            for _ in range(n_layers)
        ],
        tl.LayerNorm(),
    ]
    return tl.Model(
        tl.Parallel([], tl.ShiftRight()),
        tl.Parallel(encoder, positional_embedder),
        tl.Select(inputs=(('encoder', 'mask'), 'decoder'),
                  output=('decoder', ('mask', 'decoder'), 'encoder')),
        # (encoder_mask, decoder_input) -> encoder-decoder mask
        tl.Parallel([], tl.EncoderDecoderMask(), []),
        [
            EncoderDecoder(d_feature, d_feedforward, n_heads, dropout, mode)
            for _ in range(n_layers)
        ],
        tl.Select(0),  # Drop mask and encoder.
        tl.LayerNorm(),
        tl.Dense(vocab_size),
        tl.LogSoftmax(),
    )
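
The encoder above derives its mask from the raw source tokens via `tl.PaddingMask()`, and `tl.EncoderDecoderMask()` combines that with the decoder input so decoder positions cannot attend to source padding. A NumPy sketch of the intended semantics, assuming padding is token id 0; the actual trax layers may use a different shape convention.

import numpy as np

def padding_mask(token_ids, pad_id=0):
    # True for real tokens, False for padding. Assumes pad id 0.
    return token_ids != pad_id

source = np.array([[5, 7, 2, 0, 0],
                   [3, 0, 0, 0, 0]])
mask = padding_mask(source)          # (batch, source_len)
print(mask.astype(int))
# [[1 1 1 0 0]
#  [1 0 0 0 0]]

# Encoder-decoder mask: broadcast over decoder positions, so every target
# position sees the same source-padding pattern.
target_len = 3
enc_dec_mask = np.broadcast_to(mask[:, None, :], (mask.shape[0], target_len, mask.shape[1]))
print(enc_dec_mask.shape)            # (2, 3, 5)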
Example #15
def ChunkedCausalMultiHeadedAttention(d_feature,
                                      n_heads=8,
                                      dropout=0.0,
                                      chunk_selector=None,
                                      mode='train'):
    """Transformer-style causal multi-headed attention operating on chunks.

  Accepts inputs that are a list of chunks and applies causal attention.

  Args:
    d_feature: int:  depth of embedding
    n_heads: int: number of attention heads
    dropout: float: dropout rate
    chunk_selector: a function from chunk number to list of chunks to attend.
    mode: str: 'train' or 'eval'

  Returns:
    Multi-headed self-attention layer.
  """
    prepare_attention_input = tl.Serial(
        tl.Branch(
            tl.Branch(  # q = k = v = first input
                tl.NoOp(), tl.NoOp(), tl.NoOp()),
            tl.CausalMask(axis=-2),
        ),
        tl.Parallel(
            tl.Parallel(
                tl.Dense(d_feature),
                tl.Dense(d_feature),
                tl.Dense(d_feature),
            ), tl.NoOp()))
    return tl.Serial(
        tl.Map(prepare_attention_input),
        ChunkedAttentionSelector(selector=chunk_selector),  # pylint: disable=no-value-for-parameter
        tl.Map(tl.PureMultiHeadedAttention(d_feature=d_feature,
                                           n_heads=n_heads,
                                           dropout=dropout,
                                           mode=mode),
               check_shapes=False),
        tl.Map(tl.Select(0), check_shapes=False),  # drop masks
        tl.Map(tl.Dense(d_feature)))
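
`chunk_selector` above maps a chunk index to the list of chunks that chunk may attend to; with causal chunking, chunk k typically attends to itself and some of the earlier chunks. A pure-Python sketch of such a selector and of splitting a sequence into chunks; both helpers are illustrative assumptions, not the trax API.

import numpy as np

def causal_chunk_selector(chunk_index, window=2):
    # Chunk k attends to itself and up to `window - 1` preceding chunks.
    # Illustrative selector; the trax code leaves chunk_selector configurable.
    start = max(0, chunk_index - (window - 1))
    return list(range(start, chunk_index + 1))

x = np.arange(12).reshape(1, 12, 1)        # (batch, time, features)
chunks = np.split(x, 4, axis=1)            # 4 chunks of length 3 each
for k in range(len(chunks)):
    print(k, causal_chunk_selector(k))     # 0 [0] / 1 [0, 1] / 2 [1, 2] / 3 [2, 3]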
Example #16
def Transformer(vocab_size,
                feature_depth=512,
                feedforward_depth=2048,
                num_layers=6,
                num_heads=8,
                dropout=0.1,
                max_len=2048,
                mode='train'):
    """Transformer.

  This model expects a pair (source, target) as input.

  Args:
    vocab_size: int: vocab size (shared source and target).
    feature_depth: int:  depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_layers: int: number of encoder/decoder layers
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train' or 'eval'

  Returns:
    the Transformer model.
  """
    embedding = layers.Serial(layers.Embedding(feature_depth, vocab_size),
                              layers.Dropout(rate=dropout, mode=mode),
                              layers.PositionalEncoding(max_len=max_len))
    encoder = layers.Serial(
        layers.Branch(),  # Branch input to create embedding and mask.
        layers.Parallel(embedding, layers.PaddingMask()),
        layers.Serial(*[
            EncoderLayer(feature_depth, feedforward_depth, num_heads, dropout,
                         mode) for _ in range(num_layers)
        ]),
        layers.Parallel(layers.LayerNorm(), layers.Identity()))
    stack = [
        EncoderDecoderLayer(feature_depth, feedforward_depth, num_heads,
                            dropout, mode) for _ in range(num_layers)
    ]
    return layers.Serial(
        layers.Parallel(layers.Identity(), layers.ShiftRight()),
        layers.Parallel(encoder, embedding),
        layers.UnnestBranches(),  # (encoder, encoder_mask, decoder_input)
        layers.Reorder(output=(0, (1, 2), 2)),
        layers.Parallel(  # (encoder_mask, decoder_input) -> encoder-decoder mask
            layers.Identity(), layers.EncoderDecoderMask(), layers.Identity()),
        layers.Serial(*stack),
        layers.ThirdBranch(),
        layers.LayerNorm(),
        layers.Dense(vocab_size),
        layers.LogSoftmax())
Example #17
def IdentityBlock(kernel_size, filters):
    """ResNet identical size block."""
    ks = kernel_size
    filters1, filters2, filters3 = filters
    main = layers.Serial(layers.Conv(filters1, (1, 1)), layers.BatchNorm(),
                         layers.Relu(),
                         layers.Conv(filters2, (ks, ks), padding='SAME'),
                         layers.BatchNorm(), layers.Relu(),
                         layers.Conv(filters3, (1, 1)), layers.BatchNorm())
    return layers.Serial(layers.Branch(),
                         layers.Parallel(main, layers.Identity()),
                         layers.SumBranches(), layers.Relu())
Example #18
def ConvBlock(kernel_size, filters, strides):
    """ResNet convolutional striding block."""
    ks = kernel_size
    filters1, filters2, filters3 = filters
    main = layers.Serial(layers.Conv(filters1, (1, 1), strides),
                         layers.BatchNorm(), layers.Relu(),
                         layers.Conv(filters2, (ks, ks), padding='SAME'),
                         layers.BatchNorm(), layers.Relu(),
                         layers.Conv(filters3, (1, 1)), layers.BatchNorm())
    shortcut = layers.Serial(layers.Conv(filters3, (1, 1), strides),
                             layers.BatchNorm())
    return layers.Serial(layers.Branch(), layers.Parallel(main, shortcut),
                         layers.SumBranches(), layers.Relu())
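
The shortcut in ConvBlock is a strided 1x1 convolution (rather than the identity used in IdentityBlock) because the main path changes both the channel count and, with 'SAME' padding and a stride, the spatial size; the shortcut must match those shapes before SumBranches can add the two paths. A small sketch of the shape arithmetic, with illustrative sizes.

import math

def same_padding_output_size(input_size, stride):
    # 'SAME' padding: output spatial size is ceil(input / stride).
    return math.ceil(input_size / stride)

h = w = 56          # illustrative input spatial size
stride = 2
filters3 = 256      # illustrative output channel count
print((same_padding_output_size(h, stride), same_padding_output_size(w, stride), filters3))
# (28, 28, 256): the shortcut's strided 1x1 conv produces exactly this shape,
# so it can be summed with the main path.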
Example #19
def Transformer(vocab_size,
                feature_depth=512,
                feedforward_depth=2048,
                num_layers=6,
                num_heads=8,
                dropout=0.1,
                max_len=2048,
                mode='train'):
    """Transformer.

  This model expects a pair (source, target) as input.

  Args:
    vocab_size: int: vocab size (shared source and target).
    feature_depth: int:  depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_layers: int: number of encoder/decoder layers
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train' or 'eval'

  Returns:
    the Transformer model.
  """
    embedding = tl.Serial(tl.Embedding(feature_depth, vocab_size),
                          tl.Dropout(rate=dropout, mode=mode),
                          tl.PositionalEncoding(max_len=max_len))
    encoder = tl.Serial(
        tl.Branch(embedding, tl.PaddingMask()),
        tl.Serial(*[
            EncoderLayer(feature_depth, feedforward_depth, num_heads, dropout,
                         mode) for _ in range(num_layers)
        ]), tl.Parallel(tl.LayerNorm(), tl.NoOp()))
    stack = [
        EncoderDecoderLayer(feature_depth, feedforward_depth, num_heads,
                            dropout, mode) for _ in range(num_layers)
    ]
    return tl.Serial(
        tl.Parallel(tl.NoOp(), tl.ShiftRight()),
        tl.Parallel(encoder, embedding),
        tl.Select(inputs=(('encoder', 'mask'), 'decoder'),
                  output=('encoder', ('mask', 'decoder'), 'decoder')),
        tl.Parallel(  # (encoder_mask, decoder_input) -> encoder-decoder mask
            tl.NoOp(), tl.EncoderDecoderMask(), tl.NoOp()),
        tl.Serial(*stack),
        tl.Select(2),  # Drop encoder and mask.
        tl.LayerNorm(),
        tl.Dense(vocab_size),
        tl.LogSoftmax())
Example #20
def TransformerEncoder(vocab_size,
                       num_classes=10,
                       feature_depth=512,
                       feedforward_depth=2048,
                       num_layers=6,
                       num_heads=8,
                       dropout=0.1,
                       max_len=2048,
                       mode='train'):
  """Transformer encoder.

  Args:
    vocab_size: int: vocab size
    num_classes: how many classes on output
    feature_depth: int:  depth of embedding
    feedforward_depth: int: depth of feed-forward layer
    num_layers: int: number of encoder/decoder layers
    num_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train' or 'eval'

  Returns:
    the Transformer encoder layer.
  """
  input_embedding = layers.Serial(
      layers.Embedding(feature_depth, vocab_size),
      layers.Dropout(rate=dropout, mode=mode),
      layers.PositionalEncoding(max_len=max_len)
  )
  return layers.Serial(
      layers.Branch(),  # Branch input to create embedding and mask.
      layers.Parallel(input_embedding, layers.PaddingMask()),
      layers.Serial(*[EncoderLayer(feature_depth, feedforward_depth, num_heads,
                                   dropout, mode)
                      for _ in range(num_layers)]),
      layers.FirstBranch(),  # Drop the mask.
      layers.LayerNorm(),
      layers.Mean(axis=1),  # Average on length.
      layers.Dense(num_classes),
      layers.LogSoftmax()
  )
Example #21
def TransformerEncoder(vocab_size,
                       n_classes=10,
                       d_feature=512,
                       d_feedforward=2048,
                       n_layers=6,
                       n_heads=8,
                       dropout=0.1,
                       max_len=2048,
                       mode='train'):
    """Transformer encoder.

  Args:
    vocab_size: int: vocab size
    n_classes: how many classes on output
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_layers: int: number of encoder/decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train' or 'eval'

  Returns:
    the Transformer encoder layer.
  """
    positional_embedder = [
        tl.Embedding(d_feature, vocab_size),
        tl.Dropout(rate=dropout, mode=mode),
        tl.PositionalEncoding(max_len=max_len),
    ]
    return [
        tl.Branch(positional_embedder, tl.PaddingMask()),  # Create mask.
        [
            EncoderBlock(d_feature, d_feedforward, n_heads, dropout, mode)
            for _ in range(n_layers)
        ],
        tl.Select(0),  # Drop mask.
        tl.LayerNorm(),
        tl.Mean(axis=1),  # Average on length.
        tl.Dense(n_classes),
        tl.LogSoftmax(),
    ]
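
After the encoder blocks, the classifier head above averages over the length axis (`tl.Mean(axis=1)`), projects to `n_classes`, and takes a log-softmax. A NumPy sketch of that head; the random projection standing in for `tl.Dense` and the toy sizes are illustrative assumptions.

import numpy as np

def log_softmax(x, axis=-1):
    x = x - np.max(x, axis=axis, keepdims=True)
    return x - np.log(np.sum(np.exp(x), axis=axis, keepdims=True))

batch, length, d_feature, n_classes = 2, 7, 16, 10
activations = np.random.default_rng(0).normal(size=(batch, length, d_feature))

pooled = activations.mean(axis=1)                   # (batch, d_feature): Mean(axis=1)
w = np.random.default_rng(1).normal(size=(d_feature, n_classes))  # illustrative, not tl.Dense
log_probs = log_softmax(pooled @ w)                 # (batch, n_classes)
print(log_probs.shape, np.allclose(np.exp(log_probs).sum(axis=-1), 1.0))  # (2, 10) True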
Example #22
def policy_and_value_net(rng_key,
                         batch_observations_shape,
                         num_actions,
                         bottom_layers=None):
  """A policy and value net function."""

  # Layers.
  cur_layers = []
  if bottom_layers is not None:
    cur_layers.extend(bottom_layers)

  # Now, with the current logits, one head computes action probabilities and the
  # other computes the value function.
  # NOTE: We use LogSoftmax instead of Softmax for numerical stability.
  cur_layers.extend([
      layers.Branch(
          layers.Serial(layers.Dense(num_actions), layers.LogSoftmax()),
          layers.Dense(1))
  ])
  net = layers.Serial(*cur_layers)
  return net.initialize(batch_observations_shape, rng_key), net
Example #23
def DecoderBlock(d_feature, d_feedforward, n_heads, n_attention_chunks,
                 dropout, mode):
    """Reversible transformer decoder layer.

  Args:
    d_feature: int:  depth of embedding
    d_feedforward: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    n_attention_chunks: int: number of chunks for memory-efficient attention
    dropout: float: dropout rate (how much to drop out)
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
    self_attention = [
        tl.LayerNorm(),
        tl.Branch([], tl.CausalMask(axis=-2)),  # Create mask.
        tl.MultiHeadedAttention(d_feature,
                                n_heads=n_heads,
                                dropout=dropout,
                                mode=mode),
        tl.Select(0),  # Drop mask.
        tl.Dropout(rate=dropout, mode=mode),
    ]

    # TODO(kitaev): Memory-efficient attention. This chunking is temporary.
    self_attention = [
        Split(sections=n_attention_chunks, axis=-2),  # pylint: disable=no-value-for-parameter
        Map(self_attention),
        tl.Concatenate(axis=-2),
    ]

    feed_forward = [
        FeedForward(d_feature, d_feedforward, dropout, mode=mode),
    ]
    return [
        ReversibleResidual([self_attention], [feed_forward]),
    ]
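
The temporary chunking above splits the length axis into `n_attention_chunks` pieces, runs self-attention on each piece, and concatenates the results back, so the block is shape-preserving. A NumPy sketch of that Split/Map/Concatenate round trip, with an identity function standing in for the attention layers; the helper is illustrative, not the trax combinators.

import numpy as np

def split_map_concat(x, n_chunks, fn, axis=-2):
    # Split along `axis`, apply `fn` to each chunk, concatenate back.
    # `fn` stands in for the per-chunk self-attention layers.
    chunks = np.split(x, n_chunks, axis=axis)
    return np.concatenate([fn(c) for c in chunks], axis=axis)

x = np.random.default_rng(0).normal(size=(2, 8, 4))   # (batch, length, features)
y = split_map_concat(x, n_chunks=2, fn=lambda c: c)   # identity per chunk
print(y.shape, np.array_equal(x, y))                  # (2, 8, 4) True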
Example #24
def Residual(*layers, **unused_kwargs):
    """Constructs a residual version of layers, summing input to layers output."""
    return tl.Serial(tl.Branch(tl.Serial(*layers), tl.NoOp()), tl.AddAll())
Example #25
def Residual(*layers, **unused_kwargs):
    """Constructs a residual version of layers, summing input to layers output."""
    return [tl.Branch(layers, []), tl.AddAll()]
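
Both Residual variants above express the same idea: branch the input into (f(x), x) and add the results. A pure-Python sketch of that combinator; the `residual` helper is illustrative, not the trax implementation.

import numpy as np

def residual(*fns):
    # Compose fns into one function f and return x -> f(x) + x.
    # Illustrative helper, not trax's Residual.
    def f(x):
        for fn in fns:
            x = fn(x)
        return x
    return lambda x: f(x) + x

layer = residual(lambda x: 2 * x, lambda x: x + 1)   # f(x) = 2x + 1
x = np.array([1.0, 2.0, 3.0])
print(layer(x))                                      # f(x) + x = 3x + 1 -> [ 4.  7. 10.]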