def policy_and_value_net(rng_key,
                         batch_observations_shape,
                         n_actions,
                         bottom_layers_fn=(),
                         two_towers=True):
    """Build and initialize a combined policy and value network.

    Args:
      rng_key: random key forwarded to `net.initialize`.
      batch_observations_shape: input shape forwarded to `net.initialize`.
      n_actions: int: output width of the policy head's Dense layer.
      bottom_layers_fn: optional zero-argument callable returning the layers
        placed below the heads. A falsy, non-callable value (including the
        default `()`) means "no bottom layers".
      two_towers: bool: if True, each head gets its own freshly built copy of
        the bottom layers; otherwise one copy feeds both heads.

    Returns:
      A pair `(net.initialize(batch_observations_shape, rng_key), net)`.

    Raises:
      TypeError: if `bottom_layers_fn` is neither callable nor empty.
    """

    def make_bottom():
        # The default `()` is not callable; calling it unconditionally raised
        # TypeError for every caller that relied on the default.
        if callable(bottom_layers_fn):
            return bottom_layers_fn()
        if not bottom_layers_fn:
            return []  # NOTE(review): assumes an empty list acts as a no-op layer.
        raise TypeError(
            'bottom_layers_fn must be a zero-argument callable, got %r'
            % (bottom_layers_fn,))

    # With the current logits, one head computes action probabilities and the
    # other computes the value function.
    # NOTE: The LogSoftmax instead of the Softmax because of numerical stability.
    if two_towers:
        net = tl.Branch(
            [make_bottom(), tl.Dense(n_actions), tl.LogSoftmax()],
            [make_bottom(), tl.Dense(1)])
    else:
        net = tl.Serial(
            make_bottom(),
            tl.Branch(
                [tl.Dense(n_actions), tl.LogSoftmax()],
                [tl.Dense(1)]))
    return net.initialize(batch_observations_shape, rng_key), net
def policy_and_value_net(rng_key,
                         batch_observations_shape,
                         num_actions,
                         bottom_layers_fn=None,
                         two_towers=True):
    """Build and initialize a combined policy and value network.

    Args:
      rng_key: random key forwarded to `net.initialize`.
      batch_observations_shape: input shape forwarded to `net.initialize`.
      num_actions: int: output width of the policy head's Dense layer.
      bottom_layers_fn: optional zero-argument callable returning an iterable
        of layers placed below the heads; None means no bottom layers.
      two_towers: bool: if True, each head gets its own freshly built copy of
        the bottom layers; otherwise one copy feeds both heads.

    Returns:
      A pair `(net.initialize(batch_observations_shape, rng_key), net)`.
    """

    def new_tower():
        # Copy into a fresh list so the object returned by bottom_layers_fn is
        # never mutated in place (and a tuple return value is accepted too).
        if bottom_layers_fn is None:
            return []
        return list(bottom_layers_fn())

    # With the current logits, one head computes action probabilities and the
    # other computes the value function.
    # NOTE: The LogSoftmax instead of the Softmax because of numerical stability.
    if not two_towers:
        tower = new_tower()
        heads = layers.Branch(
            layers.Serial(layers.Dense(num_actions), layers.LogSoftmax()),
            layers.Dense(1))
        net = layers.Serial(*(tower + [heads]))
    else:
        policy_tower = new_tower()
        value_tower = new_tower()
        policy_tower = policy_tower + [layers.Dense(num_actions),
                                       layers.LogSoftmax()]
        value_tower = value_tower + [layers.Dense(1)]
        net = layers.Branch(
            layers.Serial(*policy_tower),
            layers.Serial(*value_tower),
        )
    return net.initialize(batch_observations_shape, rng_key), net
def EncoderDecoderLayer(feature_depth,
                        feedforward_depth,
                        num_heads,
                        dropout,
                        mode):
    """Build one Transformer encoder-decoder layer.

    The layer takes a triple (encoder, mask, decoder_input), where the mask is
    created from the original source to prevent attending to the padding part
    of the encoder.

    Args:
      feature_depth: int: depth of embedding
      feedforward_depth: int: depth of feed-forward layer
      num_heads: int: number of attention heads
      dropout: float: dropout rate (how much to drop out)
      mode: str: 'train' or 'eval'

    Returns:
      the layer, returning a triple (encoder, mask, decoder_activations).
    """
    attention_kwargs = dict(num_heads=num_heads, dropout=dropout, mode=mode)

    # Decoder self-attending to decoder.
    decoder_self_attention = layers.Residual(
        layers.LayerNorm(),
        layers.Branch(),
        layers.Parallel(
            layers.Identity(),  # activation for (q, k, v)
            layers.CausalMask(axis=-2),  # attention mask
        ),
        layers.MultiHeadedAttention(feature_depth, **attention_kwargs),
        layers.Dropout(rate=dropout, mode=mode),
    )

    # Decoder attending to encoder.
    attend_to_encoder = layers.Serial(
        layers.Reorder(output=((2, 0, 0), 1)),  # ((dec, enc, enc), mask)
        # ((q, k, v), mask) --> new v
        layers.MultiHeadedAttentionQKV(feature_depth, **attention_kwargs),
        layers.Dropout(rate=dropout, mode=mode),
    )

    stages = [
        layers.Parallel(layers.Identity(), layers.Identity(),
                        decoder_self_attention),
        layers.Branch(),
        layers.Parallel(layers.Identity(), attend_to_encoder),
        layers.UnnestBranches(),  # (encoder, mask, old_act, new_act)
        layers.Reorder(output=(0, 1, (2, 3))),
        # Residual after encoder-decoder attention.
        layers.Parallel(layers.Identity(), layers.Identity(),
                        layers.SumBranches()),
        # Feed-forward on the third component (decoder).
        layers.Parallel(
            layers.Identity(),
            layers.Identity(),
            ResidualFeedForward(feature_depth, feedforward_depth, dropout,
                                mode=mode)),
    ]
    return layers.Serial(*stages)
def EncoderDecoderLayer(feature_depth,
                        feedforward_depth,
                        num_heads,
                        dropout,
                        mode):
    """Build one Transformer encoder-decoder layer.

    The layer takes a triple (encoder, mask, decoder_input), where the mask is
    created from the original source to prevent attending to the padding part
    of the encoder.

    Args:
      feature_depth: int: depth of embedding
      feedforward_depth: int: depth of feed-forward layer
      num_heads: int: number of attention heads
      dropout: float: dropout rate (how much to drop out)
      mode: str: 'train' or 'eval'

    Returns:
      the layer, returning a triple (encoder, mask, decoder_activations).
    """
    attention_kwargs = dict(num_heads=num_heads, dropout=dropout, mode=mode)

    # Decoder self-attending to decoder.
    decoder_self_attention = tl.Residual(
        tl.LayerNorm(),
        tl.Branch(tl.NoOp(), tl.CausalMask(axis=-2)),  # create mask
        tl.MultiHeadedAttention(feature_depth, **attention_kwargs),
        tl.Select(0),  # drop mask
        tl.Dropout(rate=dropout, mode=mode),
    )

    # Decoder attending to encoder.
    attend_to_encoder = tl.Serial(
        tl.Select((2, 0, 0, 1)),  # (dec, enc, enc, mask)
        # (q, k, v, mask) --> new, mask
        tl.MultiHeadedAttentionQKV(feature_depth, **attention_kwargs),
        tl.Select(0),  # drop the mask
        tl.Dropout(rate=dropout, mode=mode),
    )

    # Pair the pre-attention and post-attention decoder activations so that
    # the following stage can add them.
    regroup = tl.Select(
        inputs=(('encoder', 'mask', 'old_act'), 'new_act'),
        output=('encoder', 'mask', ('old_act', 'new_act')))

    stages = [
        tl.Parallel(tl.NoOp(), tl.NoOp(), decoder_self_attention),
        tl.Branch(tl.NoOp(), attend_to_encoder),
        regroup,
        # Residual after encoder-decoder attention.
        tl.Parallel(tl.NoOp(), tl.NoOp(), tl.Add()),
        # Feed-forward on the third component (decoder).
        tl.Parallel(
            tl.NoOp(),
            tl.NoOp(),
            ResidualFeedForward(feature_depth, feedforward_depth, dropout,
                                mode=mode)),
    ]
    return tl.Serial(*stages)
def AtariCnn(hidden_sizes=(32, 32), output_size=128):
    """Build a small convolutional network over stacked Atari frames.

    Args:
      hidden_sizes: sequence whose first two entries are passed as the first
        argument of the first and second Conv layer respectively.
      output_size: int: output width of the final Dense layer.

    Returns:
      the layer. Input's shape = (B, T, H, W, C); eventually the output is
      shaped (B, T, output_size).
    """

    def conv_stage(filters):
        # Each Conv uses a (5, 5) kernel, (2, 2) strides and 'SAME' padding,
        # wrapped in Rebatch with 2 as its second argument.
        return tl.Rebatch(tl.Conv(filters, (5, 5), (2, 2), 'SAME'), 2)

    # Have 4 copies of the input, each one shifted to the right by one.
    frame_history = tl.Branch(
        tl.NoOp(),
        tl.ShiftRight(),
        tl.Serial(*[tl.ShiftRight() for _ in range(2)]),
        tl.Serial(*[tl.ShiftRight() for _ in range(3)]),
    )

    stages = [
        tl.Div(divisor=255.0),
        frame_history,
        # Concatenated on the last axis.
        tl.Concatenate(axis=-1),  # (B, T, H, W, 4C)
        conv_stage(hidden_sizes[0]),
        tl.Relu(),
        conv_stage(hidden_sizes[1]),
        tl.Relu(),
        tl.Flatten(num_axis_to_keep=2),  # B, T and rest.
        tl.Dense(output_size),
        tl.Relu(),
    ]
    return tl.Serial(*stages)
def Encoder(source, source_mask):
    """Transformer encoder stack.

    Reads `multi_attention`, `dropout`, `mode`, `feature_depth`,
    `feedforward_depth`, `source_embedding_layer` and `num_layers` from an
    enclosing scope that is not defined inside this function.

    Args:
      source: layer variable: raw source sequences
      source_mask: layer variable: self-attention mask

    Returns:
      Layer variable that outputs encoded source.
    """
    # Input attends to self: the branched input supplies query, key and value,
    # and the fourth branch is the attention mask.
    self_attention = layers.Residual(
        layers.LayerNorm(),
        layers.Branch(size=4),
        layers.Parallel(
            layers.Identity(),  # query
            layers.Identity(),  # key
            layers.Identity(),  # value
            source_mask,  # attention mask
        ),
        multi_attention,
        layers.Dropout(dropout, mode=mode),
    )
    feed_forward = ResidualFeedForward(
        feature_depth, feedforward_depth, dropout, mode=mode)
    encoder_layer = layers.Serial(self_attention, feed_forward)

    pipeline = [
        source,
        source_embedding_layer,
        layers.repeat(encoder_layer, num_layers),
        layers.LayerNorm(),
    ]
    return layers.Serial(*pipeline)
def DecoderLayer(feature_depth,
                 feedforward_depth,
                 num_heads,
                 dropout,
                 mode):
    """Build one Transformer decoder layer.

    Args:
      feature_depth: int: depth of embedding
      feedforward_depth: int: depth of feed-forward layer
      num_heads: int: number of attention heads
      dropout: float: dropout rate (how much to drop out)
      mode: str: 'train' or 'eval'

    Returns:
      the layer.
    """
    # Self-attention block.
    self_attention = layers.Residual(
        layers.LayerNorm(),
        layers.Branch(),
        layers.Parallel(
            layers.Identity(),  # activation for (q, k, v)
            layers.CausalMask(axis=-2),  # attention mask
        ),
        layers.MultiHeadedAttention(
            feature_depth, num_heads=num_heads, dropout=dropout, mode=mode),
        layers.Dropout(rate=dropout, mode=mode),
    )
    feed_forward = ResidualFeedForward(
        feature_depth, feedforward_depth, dropout, mode=mode)
    return layers.Serial(self_attention, feed_forward)
def DecoderLayer(positions,
                 d_feature,
                 d_feedforward,
                 n_heads,
                 dropout,
                 mode):
    """Build one position-aware Transformer decoder layer.

    Args:
      positions: random vectors for positions
      d_feature: int: depth of embedding
      d_feedforward: int: depth of feed-forward layer
      n_heads: int: number of attention heads
      dropout: float: dropout rate (how much to drop out)
      mode: str: 'train' or 'eval'

    Returns:
      the layer, as a list of two sub-layers.
    """
    make_mask = tl.Branch(
        [],  # activation for (q, k, v)
        tl.CausalMask(axis=-2),  # attention mask
    )
    # Self-attention block.
    self_attention = tl.Residual(
        PreservePosition(tl.LayerNorm()),
        make_mask,
        MultiHeadedAttentionPosition(
            positions, d_feature, n_heads=n_heads, dropout=dropout,
            mode=mode),
        PreservePosition(tl.Dropout(rate=dropout, mode=mode)),
    )
    feed_forward = ResidualFeedForward(
        d_feature, d_feedforward, dropout, mode=mode)
    return [self_attention, feed_forward]
def DecoderLayer(feature_depth,
                 feedforward_depth,
                 num_heads,
                 dropout,
                 mode):
    """Build one Transformer decoder layer.

    Args:
      feature_depth: int: depth of embedding
      feedforward_depth: int: depth of feed-forward layer
      num_heads: int: number of attention heads
      dropout: float: dropout rate (how much to drop out)
      mode: str: 'train' or 'eval'

    Returns:
      the layer.
    """
    # Self-attention block.
    self_attention = tl.Residual(
        tl.LayerNorm(),
        tl.Branch(tl.Copy(), tl.CausalMask(axis=-2)),  # Create mask.
        tl.MultiHeadedAttention(
            feature_depth, num_heads=num_heads, dropout=dropout, mode=mode),
        tl.Select(0),  # Drop the mask.
        tl.Dropout(rate=dropout, mode=mode),
    )
    feed_forward = ResidualFeedForward(
        feature_depth, feedforward_depth, dropout, mode=mode)
    return tl.Serial(self_attention, feed_forward)
def SumLearnedPick(positions):
    """Get a pair (vec, pos) and pick new pos.

    Builds key/value tables from `positions` for four lookups (successor,
    subtract-one, addition and subtraction) and combines them, together with
    one `LearnedQP` that gets no table, through `SoftmaxBranches`.

    Args:
      positions: 2-D array of position vectors, indexed as
        `positions[row, :]`.

    Returns:
      the layer.
    """
    earlier_rows = positions[:-1, :]
    later_rows = positions[1:, :]

    # Only the first half of the rows is used as operands, so the index i + j
    # below always stays inside `positions`.
    half = int(positions.shape[0]) // 2

    def pair_key(i, j):
        return np.concatenate([positions[i, :], positions[j, :]])

    add_pairs = [(i, j) for i in range(half) for j in range(half)]
    add_keys = np.array([pair_key(i, j) for i, j in add_pairs])
    add_values = np.array([positions[i + j, :] for i, j in add_pairs])

    # TODO(lukaszkaiser): try this below: "for j in range(i) for i in range(2*l)"
    sub_pairs = [(i, j) for j in range(half) for i in range(half)]
    sub_keys = np.array([pair_key(i, j) for i, j in sub_pairs])
    # Differences are clamped at row 0 instead of going negative.
    sub_values = np.array([positions[max(i - j, 0), :] for i, j in sub_pairs])

    candidates = tl.Branch(
        LearnedQP(),
        LearnedQP(keys=earlier_rows, values=later_rows),  # successor
        LearnedQP(keys=later_rows, values=earlier_rows),  # subtract 1
        LearnedQP(keys=add_keys, values=add_values, binary=True),
        LearnedQP(keys=sub_keys, values=sub_values, binary=True),
    )
    return tl.Serial(candidates, Unnest(), SoftmaxBranches(n_branches=5))
def DecoderBlock(d_feature, d_feedforward, n_heads, dropout, mode):
    """Build one Transformer decoder block.

    Args:
      d_feature: int: depth of embedding
      d_feedforward: int: depth of feed-forward layer
      n_heads: int: number of attention heads
      dropout: float: dropout rate (how much to drop out)
      mode: str: 'train' or 'eval'

    Returns:
      the layer, as a list of two residual sub-layers.
    """
    attention = tl.MultiHeadedAttention(
        d_feature, n_heads=n_heads, dropout=dropout, mode=mode)
    self_attention = [
        tl.LayerNorm(),
        tl.Branch([], tl.CausalMask(axis=-2)),  # Create mask.
        attention,
        tl.Select(0),  # Drop mask.
        tl.Dropout(rate=dropout, mode=mode),
    ]
    feed_forward = [FeedForward(d_feature, d_feedforward, dropout, mode=mode)]
    return [tl.Residual(sublayer) for sublayer in (self_attention, feed_forward)]
def Decoder(memory, target, target_mask, memory_mask):
    """Transformer decoder stack.

    Reads `multi_attention`, `dropout`, `mode`, `feature_depth`,
    `feedforward_depth`, `target_embedding_layer` and `num_layers` from an
    enclosing scope that is not defined inside this function.

    Args:
      memory: layer variable: encoded source sequences
      target: layer variable: raw target sequences
      target_mask: layer variable: self-attention mask
      memory_mask: layer variable: memory attention mask

    Returns:
      Layer variable built from the target embedding, `num_layers` repeats of
      the decoder layer and a final LayerNorm.
    """

    def attention_block(key, value, mask):
        # Residual attention whose query is always the block's own input.
        return layers.Residual(
            layers.LayerNorm(),
            layers.Branch(size=4),
            layers.Parallel(layers.Identity(), key, value, mask),
            multi_attention,
            layers.Dropout(dropout, mode=mode))

    # Target attends to self.
    self_attention = attention_block(
        layers.Identity(), layers.Identity(), target_mask)
    # Target attends to encoded source.
    memory_attention = attention_block(memory, memory, memory_mask)
    feed_forward = ResidualFeedForward(
        feature_depth, feedforward_depth, dropout, mode=mode)
    decoder_layer = layers.Serial(
        self_attention, memory_attention, feed_forward)

    pipeline = [
        target,
        target_embedding_layer,
        layers.repeat(decoder_layer, num_layers),
        layers.LayerNorm(),
    ]
    return layers.Serial(*pipeline)
def WideResnetBlock(channels, strides=(1, 1), channel_mismatch=False):
    """WideResnet convolutional block.

    Args:
      channels: first argument passed to every Conv layer in the block.
      strides: strides of the first Conv in the main path and of the shortcut
        Conv, when one is used.
      channel_mismatch: bool: if True the shortcut is a Conv layer instead of
        the identity.

    Returns:
      the layer: main path and shortcut applied to the same input, then
      summed.
    """
    main = layers.Serial(
        layers.BatchNorm(),
        layers.Relu(),
        layers.Conv(channels, (3, 3), strides, padding='SAME'),
        layers.BatchNorm(),
        layers.Relu(),
        layers.Conv(channels, (3, 3), padding='SAME'),
    )
    if channel_mismatch:
        shortcut = layers.Conv(channels, (3, 3), strides, padding='SAME')
    else:
        shortcut = layers.Identity()
    return layers.Serial(
        layers.Branch(),
        layers.Parallel(main, shortcut),
        layers.SumBranches(),
    )
def Transformer(vocab_size,
                d_feature=512,
                d_feedforward=2048,
                n_layers=6,
                n_heads=8,
                dropout=0.1,
                max_len=2048,
                mode='train'):
    """Build the full encoder-decoder Transformer.

    This model expects on input a pair (source, target).

    Args:
      vocab_size: int: vocab size (shared source and target).
      d_feature: int: depth of embedding
      d_feedforward: int: depth of feed-forward layer
      n_layers: int: number of encoder/decoder layers
      n_heads: int: number of attention heads
      dropout: float: dropout rate (how much to drop out)
      max_len: int: maximum symbol length for positional encoding
      mode: str: 'train' or 'eval'

    Returns:
      the Transformer model.
    """
    block_args = (d_feature, d_feedforward, n_heads, dropout, mode)

    # The same list object is used for both the source and the target side.
    positional_embedder = [
        tl.Embedding(d_feature, vocab_size),
        tl.Dropout(rate=dropout, mode=mode),
        tl.PositionalEncoding(max_len=max_len),
    ]

    encoder_blocks = [EncoderBlock(*block_args) for _ in range(n_layers)]
    encoder = [
        tl.Branch(positional_embedder, tl.PaddingMask()),
        encoder_blocks,
        tl.LayerNorm(),
    ]

    regroup = tl.Select(
        inputs=(('encoder', 'mask'), 'decoder'),
        output=('decoder', ('mask', 'decoder'), 'encoder'))
    encoder_decoder_blocks = [
        EncoderDecoder(*block_args) for _ in range(n_layers)
    ]

    return tl.Model(
        tl.Parallel([], tl.ShiftRight()),
        tl.Parallel(encoder, positional_embedder),
        regroup,
        # (encoder_mask, decoder_input) -> encoder-decoder mask
        tl.Parallel([], tl.EncoderDecoderMask(), []),
        encoder_decoder_blocks,
        tl.Select(0),  # Drop mask and encoder.
        tl.LayerNorm(),
        tl.Dense(vocab_size),
        tl.LogSoftmax(),
    )
def ChunkedCausalMultiHeadedAttention(d_feature,
                                      n_heads=8,
                                      dropout=0.0,
                                      chunk_selector=None,
                                      mode='train'):
    """Transformer-style causal multi-headed attention operating on chunks.

    Accepts inputs that are a list of chunks and applies causal attention.

    Args:
      d_feature: int: depth of embedding
      n_heads: int: number of attention heads
      dropout: float: dropout rate
      chunk_selector: a function from chunk number to list of chunks to attend.
      mode: str: 'train' or 'eval'

    Returns:
      Multi-headed self-attention layer.
    """
    # q = k = v = first input
    triple_input = tl.Branch(tl.NoOp(), tl.NoOp(), tl.NoOp())
    with_mask = tl.Branch(triple_input, tl.CausalMask(axis=-2))
    # A separate Dense projection for each of q, k and v; the mask passes
    # through unchanged.
    projections = tl.Parallel(
        tl.Dense(d_feature),
        tl.Dense(d_feature),
        tl.Dense(d_feature),
    )
    prepare_attention_input = tl.Serial(
        with_mask,
        tl.Parallel(projections, tl.NoOp()))

    attention = tl.PureMultiHeadedAttention(
        d_feature=d_feature, n_heads=n_heads, dropout=dropout, mode=mode)

    stages = [
        tl.Map(prepare_attention_input),
        ChunkedAttentionSelector(selector=chunk_selector),  # pylint: disable=no-value-for-parameter
        tl.Map(attention, check_shapes=False),
        tl.Map(tl.Select(0), check_shapes=False),  # drop masks
        tl.Map(tl.Dense(d_feature)),
    ]
    return tl.Serial(*stages)
def Transformer(vocab_size,
                feature_depth=512,
                feedforward_depth=2048,
                num_layers=6,
                num_heads=8,
                dropout=0.1,
                max_len=2048,
                mode='train'):
    """Build the full encoder-decoder Transformer.

    This model expects on input a pair (source, target).

    Args:
      vocab_size: int: vocab size (shared source and target).
      feature_depth: int: depth of embedding
      feedforward_depth: int: depth of feed-forward layer
      num_layers: int: number of encoder/decoder layers
      num_heads: int: number of attention heads
      dropout: float: dropout rate (how much to drop out)
      max_len: int: maximum symbol length for positional encoding
      mode: str: 'train' or 'eval'

    Returns:
      the Transformer model.
    """
    layer_args = (feature_depth, feedforward_depth, num_heads, dropout, mode)

    # The same embedding layer object is used for source and target.
    embedding = layers.Serial(
        layers.Embedding(feature_depth, vocab_size),
        layers.Dropout(rate=dropout, mode=mode),
        layers.PositionalEncoding(max_len=max_len),
    )

    encoder_layers = [EncoderLayer(*layer_args) for _ in range(num_layers)]
    encoder = layers.Serial(
        layers.Branch(),  # Branch input to create embedding and mask.
        layers.Parallel(embedding, layers.PaddingMask()),
        layers.Serial(*encoder_layers),
        layers.Parallel(layers.LayerNorm(), layers.Identity()),
    )

    stack = [EncoderDecoderLayer(*layer_args) for _ in range(num_layers)]

    stages = [
        layers.Parallel(layers.Identity(), layers.ShiftRight()),
        layers.Parallel(encoder, embedding),
        layers.UnnestBranches(),  # (encoder, encoder_mask, decoder_input)
        layers.Reorder(output=(0, (1, 2), 2)),
        # (encoder_mask, decoder_input) -> encoder-decoder mask
        layers.Parallel(
            layers.Identity(),
            layers.EncoderDecoderMask(),
            layers.Identity()),
        layers.Serial(*stack),
        layers.ThirdBranch(),
        layers.LayerNorm(),
        layers.Dense(vocab_size),
        layers.LogSoftmax(),
    ]
    return layers.Serial(*stages)
def IdentityBlock(kernel_size, filters):
    """ResNet identical size block.

    Args:
      kernel_size: int: side length of the square kernel of the middle Conv.
      filters: sequence of exactly three values, one per Conv layer in order.

    Returns:
      the layer: main path plus an identity shortcut, summed, then Relu.
    """
    first, middle, last = filters
    main_path = [
        layers.Conv(first, (1, 1)),
        layers.BatchNorm(),
        layers.Relu(),
        layers.Conv(middle, (kernel_size, kernel_size), padding='SAME'),
        layers.BatchNorm(),
        layers.Relu(),
        layers.Conv(last, (1, 1)),
        layers.BatchNorm(),
    ]
    main = layers.Serial(*main_path)
    return layers.Serial(
        layers.Branch(),
        layers.Parallel(main, layers.Identity()),
        layers.SumBranches(),
        layers.Relu(),
    )
def ConvBlock(kernel_size, filters, strides):
    """ResNet convolutional striding block.

    Args:
      kernel_size: int: side length of the square kernel of the middle Conv.
      filters: sequence of exactly three values, one per Conv layer of the
        main path in order; the third is also used by the shortcut Conv.
      strides: strides of the first main-path Conv and of the shortcut Conv.

    Returns:
      the layer: main path plus a Conv shortcut, summed, then Relu.
    """
    first, middle, last = filters
    main_path = [
        layers.Conv(first, (1, 1), strides),
        layers.BatchNorm(),
        layers.Relu(),
        layers.Conv(middle, (kernel_size, kernel_size), padding='SAME'),
        layers.BatchNorm(),
        layers.Relu(),
        layers.Conv(last, (1, 1)),
        layers.BatchNorm(),
    ]
    main = layers.Serial(*main_path)
    shortcut = layers.Serial(
        layers.Conv(last, (1, 1), strides),
        layers.BatchNorm(),
    )
    return layers.Serial(
        layers.Branch(),
        layers.Parallel(main, shortcut),
        layers.SumBranches(),
        layers.Relu(),
    )
def Transformer(vocab_size,
                feature_depth=512,
                feedforward_depth=2048,
                num_layers=6,
                num_heads=8,
                dropout=0.1,
                max_len=2048,
                mode='train'):
    """Build the full encoder-decoder Transformer.

    This model expects on input a pair (source, target).

    Args:
      vocab_size: int: vocab size (shared source and target).
      feature_depth: int: depth of embedding
      feedforward_depth: int: depth of feed-forward layer
      num_layers: int: number of encoder/decoder layers
      num_heads: int: number of attention heads
      dropout: float: dropout rate (how much to drop out)
      max_len: int: maximum symbol length for positional encoding
      mode: str: 'train' or 'eval'

    Returns:
      the Transformer model.
    """
    layer_args = (feature_depth, feedforward_depth, num_heads, dropout, mode)

    # The same embedding layer object is used for source and target.
    embedding = tl.Serial(
        tl.Embedding(feature_depth, vocab_size),
        tl.Dropout(rate=dropout, mode=mode),
        tl.PositionalEncoding(max_len=max_len),
    )

    encoder_layers = [EncoderLayer(*layer_args) for _ in range(num_layers)]
    encoder = tl.Serial(
        tl.Branch(embedding, tl.PaddingMask()),
        tl.Serial(*encoder_layers),
        tl.Parallel(tl.LayerNorm(), tl.NoOp()),
    )

    stack = [EncoderDecoderLayer(*layer_args) for _ in range(num_layers)]

    regroup = tl.Select(
        inputs=(('encoder', 'mask'), 'decoder'),
        output=('encoder', ('mask', 'decoder'), 'decoder'))

    stages = [
        tl.Parallel(tl.NoOp(), tl.ShiftRight()),
        tl.Parallel(encoder, embedding),
        regroup,
        # (encoder_mask, decoder_input) -> encoder-decoder mask
        tl.Parallel(tl.NoOp(), tl.EncoderDecoderMask(), tl.NoOp()),
        tl.Serial(*stack),
        tl.Select(2),  # Drop encoder and mask.
        tl.LayerNorm(),
        tl.Dense(vocab_size),
        tl.LogSoftmax(),
    ]
    return tl.Serial(*stages)
def TransformerEncoder(vocab_size,
                       num_classes=10,
                       feature_depth=512,
                       feedforward_depth=2048,
                       num_layers=6,
                       num_heads=8,
                       dropout=0.1,
                       max_len=2048,
                       mode='train'):
    """Build a Transformer encoder with a classification head.

    Args:
      vocab_size: int: vocab size
      num_classes: how many classes on output
      feature_depth: int: depth of embedding
      feedforward_depth: int: depth of feed-forward layer
      num_layers: int: number of encoder/decoder layers
      num_heads: int: number of attention heads
      dropout: float: dropout rate (how much to drop out)
      max_len: int: maximum symbol length for positional encoding
      mode: str: 'train' or 'eval'

    Returns:
      the Transformer encoder layer.
    """
    input_embedding = layers.Serial(
        layers.Embedding(feature_depth, vocab_size),
        layers.Dropout(rate=dropout, mode=mode),
        layers.PositionalEncoding(max_len=max_len),
    )
    encoder_layers = [
        EncoderLayer(feature_depth, feedforward_depth, num_heads, dropout,
                     mode)
        for _ in range(num_layers)
    ]
    stages = [
        layers.Branch(),  # Branch input to create embedding and mask.
        layers.Parallel(input_embedding, layers.PaddingMask()),
        layers.Serial(*encoder_layers),
        layers.FirstBranch(),  # Drop the mask.
        layers.LayerNorm(),
        layers.Mean(axis=1),  # Average on length.
        layers.Dense(num_classes),
        layers.LogSoftmax(),
    ]
    return layers.Serial(*stages)
def TransformerEncoder(vocab_size,
                       n_classes=10,
                       d_feature=512,
                       d_feedforward=2048,
                       n_layers=6,
                       n_heads=8,
                       dropout=0.1,
                       max_len=2048,
                       mode='train'):
    """Build a Transformer encoder with a classification head.

    Args:
      vocab_size: int: vocab size
      n_classes: how many classes on output
      d_feature: int: depth of embedding
      d_feedforward: int: depth of feed-forward layer
      n_layers: int: number of encoder/decoder layers
      n_heads: int: number of attention heads
      dropout: float: dropout rate (how much to drop out)
      max_len: int: maximum symbol length for positional encoding
      mode: str: 'train' or 'eval'

    Returns:
      the Transformer encoder layer, as a list of sub-layers.
    """
    positional_embedder = [
        tl.Embedding(d_feature, vocab_size),
        tl.Dropout(rate=dropout, mode=mode),
        tl.PositionalEncoding(max_len=max_len),
    ]
    embed_and_mask = tl.Branch(positional_embedder, tl.PaddingMask())  # Create mask.
    encoder_blocks = [
        EncoderBlock(d_feature, d_feedforward, n_heads, dropout, mode)
        for _ in range(n_layers)
    ]
    classifier_head = [
        tl.Select(0),  # Drop mask.
        tl.LayerNorm(),
        tl.Mean(axis=1),  # Average on length.
        tl.Dense(n_classes),
        tl.LogSoftmax(),
    ]
    return [embed_and_mask, encoder_blocks] + classifier_head
def policy_and_value_net(rng_key,
                         batch_observations_shape,
                         num_actions,
                         bottom_layers=None):
    """Build and initialize a combined policy and value network.

    Args:
      rng_key: random key forwarded to `net.initialize`.
      batch_observations_shape: input shape forwarded to `net.initialize`.
      num_actions: int: output width of the policy head's Dense layer.
      bottom_layers: optional iterable of layers placed below the two heads.

    Returns:
      A pair `(net.initialize(batch_observations_shape, rng_key), net)`.
    """
    # Copy the caller's layers into a fresh list before appending the heads.
    stack = list(bottom_layers) if bottom_layers is not None else []

    # Now, with the current logits, one head computes action probabilities and
    # the other computes the value function.
    # NOTE: The LogSoftmax instead of the Softmax because of numerical stability.
    policy_head = layers.Serial(layers.Dense(num_actions), layers.LogSoftmax())
    value_head = layers.Dense(1)
    stack.append(layers.Branch(policy_head, value_head))

    net = layers.Serial(*stack)
    initialized = net.initialize(batch_observations_shape, rng_key)
    return initialized, net
def DecoderBlock(d_feature,
                 d_feedforward,
                 n_heads,
                 n_attention_chunks,
                 dropout,
                 mode):
    """Build one reversible Transformer decoder block.

    Args:
      d_feature: int: depth of embedding
      d_feedforward: int: depth of feed-forward layer
      n_heads: int: number of attention heads
      n_attention_chunks: int: number of chunks for memory-efficient attention
      dropout: float: dropout rate (how much to drop out)
      mode: str: 'train' or 'eval'

    Returns:
      the layer, as a single-element list holding a ReversibleResidual.
    """
    attention_core = [
        tl.LayerNorm(),
        tl.Branch([], tl.CausalMask(axis=-2)),  # Create mask.
        tl.MultiHeadedAttention(
            d_feature, n_heads=n_heads, dropout=dropout, mode=mode),
        tl.Select(0),  # Drop mask.
        tl.Dropout(rate=dropout, mode=mode),
    ]

    # TODO(kitaev): Memory-efficient attention. This chunking is temporary.
    # Split along axis -2, apply attention to every chunk, then concatenate
    # the results along the same axis.
    chunked_attention = [
        Split(sections=n_attention_chunks, axis=-2),  # pylint: disable=no-value-for-parameter
        Map(attention_core),
        tl.Concatenate(axis=-2),
    ]

    feed_forward = [FeedForward(d_feature, d_feedforward, dropout, mode=mode)]

    reversible = ReversibleResidual([chunked_attention], [feed_forward])
    return [reversible]
def Residual(*layers, **unused_kwargs):
    """Constructs a residual version of layers, summing input to layers output.

    Args:
      *layers: layers composed in series to form the residual branch.
      **unused_kwargs: accepted and ignored.

    Returns:
      a Serial layer that branches the input into (layers output, unchanged
      input) and adds the branches together.
    """
    transformed = tl.Serial(*layers)
    passthrough = tl.NoOp()
    both = tl.Branch(transformed, passthrough)
    return tl.Serial(both, tl.AddAll())
def Residual(*layers, **unused_kwargs):
    """Constructs a residual version of layers, summing input to layers output.

    Args:
      *layers: layers forming the residual branch; passed on as one tuple.
      **unused_kwargs: accepted and ignored.

    Returns:
      a two-element list: a Branch over (layers, an empty list) followed by
      AddAll.
    """
    passthrough = []  # Empty list is the second branch, alongside `layers`.
    both = tl.Branch(layers, passthrough)
    combine = tl.AddAll()
    return [both, combine]