def PositionalEncoder(vocab_size):  # tokens --> vectors
  return [
      tl.Embedding(vocab_size, d_model),
      tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
      tl.PositionalEncoding(max_len=max_len),
  ]
def FeedForwardWithOptions(d_model, d_ff, dropout, dropout_shared_axes,
                           ff_activation, ff_dropout, ff_chunk_size,
                           ff_use_sru, ff_sparsity, mode, use_bfloat16=False,
                           ff_sparsity_type='1inN'):
  """Feed-Forward block with all the options.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    ff_activation: Type of activation function at the end of each block; must
      be an activation-type subclass of `Layer`.
    ff_dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout after the FF dense layer.
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_use_sru: int or pair of ints; if > 0, we use this many SRU layers
      in addition to the feed-forward block (second int specifies sru size)
    ff_sparsity: int, tuple or string; if not 0, use sparse feed-forward block
      with this sparsity
    mode: If `'train'`, each block will include dropout; else, it will pass
      all values through unaltered.
    use_bfloat16: whether to use bfloat16 for weights (default: False).
    ff_sparsity_type: string, if ff_sparsity >0,
      use SparseFF if ff_sparsity_type=`'1inN'` and
      use BlockSparseFF if ff_sparsity_type=`'Block'`

  Returns:
    A list of layers which maps vectors to vectors.
  """
  if ff_sparsity and ff_sparsity_type == '1inN':
    temperature, quant_prob = 0.1, 0.3
    if isinstance(ff_sparsity, str):
      # This is hacky but used to pass ff_sparsity in yaml sweep files.
      ff_sparsity = [(float(x) if '.' in x else int(x))
                     for x in ff_sparsity.split()]
    if isinstance(ff_sparsity, (list, tuple)):
      if len(ff_sparsity) == 2:
        n_elements_in_block, d_lowrank = ff_sparsity
      else:
        n_elements_in_block, d_lowrank, temperature, quant_prob = ff_sparsity
    else:
      assert isinstance(ff_sparsity, int)
      n_elements_in_block, d_lowrank = ff_sparsity, d_ff // ff_sparsity
    ff = tl.SparseFF(
        d_ff, n_elements_in_block=n_elements_in_block, d_lowrank=d_lowrank,
        temperature=temperature, quant_prob=quant_prob,
        use_bfloat16=use_bfloat16, mode=mode, dropout_rate=dropout,
        dropout_shared_axes=dropout_shared_axes, ff_chunk_size=ff_chunk_size)
  elif ff_sparsity and ff_sparsity_type == 'Block':
    ff = tl.BlockSparseFF(d_ff, num_experts=ff_sparsity, mode=mode)
  else:
    ff = _FeedForward(d_model, d_ff, dropout, ff_activation, ff_dropout,
                      use_bfloat16, mode)
  res = [tl.LayerNorm(), ff]
  if ff_sparsity_type != '1inN' or ff_sparsity == 0:
    # SparseFF has Dropout and BatchLeadingAxes built-in.
    res.append(
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode))
    if ff_chunk_size > 0:
      res = tl.BatchLeadingAxes(tl.Chunk(tl.Serial(res), ff_chunk_size))
  if ff_use_sru:
    if isinstance(ff_use_sru, (list, tuple)):
      sru_n_layers, sru_n_units = ff_use_sru
    else:
      sru_n_layers, sru_n_units = ff_use_sru, 32
    sru = [tl.SRU(sru_n_units, mode=mode) for _ in range(sru_n_layers)]
    block = [tl.LayerNorm(), tl.Dense(sru_n_units)] + sru + [tl.Dense(d_model)]
    res = tl.Residual(block, shortcut=res)
  return [res]
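# Hypothetical configuration sketch with placeholder sizes:
# FeedForwardWithOptions returns a one-element list of layers, so it drops
# straight into tl.Serial. The two calls below contrast the dense default
# with a '1inN' SparseFF using 1-in-64 blocks (d_lowrank = d_ff // 64);
# they assume `from trax import layers as tl` and the `_FeedForward` helper
# referenced above.
dense_ff = FeedForwardWithOptions(
    d_model=512, d_ff=2048, dropout=0.1, dropout_shared_axes=None,
    ff_activation=tl.Relu, ff_dropout=0.1, ff_chunk_size=0, ff_use_sru=0,
    ff_sparsity=0, mode='train')
sparse_ff = FeedForwardWithOptions(
    d_model=512, d_ff=2048, dropout=0.1, dropout_shared_axes=None,
    ff_activation=tl.Relu, ff_dropout=0.1, ff_chunk_size=0, ff_use_sru=0,
    ff_sparsity=64, mode='train')
ff_block = tl.Serial(sparse_ff)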
def DecoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode,
                 ff_activation, ff_dropout, ff_chunk_size, ff_use_sru,
                 ff_sparsity, ff_sparsity_type, attention_chunk_size,
                 attention_type, n_attention_layers=1, n_feedforward_layers=1):
  """Returns a list of layers that implements a Transformer decoder block.

  The input is an activation tensor.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    n_heads: Number of attention heads.
    dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    mode: If `'train'`, each block will include dropout; else, it will pass
      all values through unaltered.
    ff_activation: Type of activation function at the end of each block; must
      be an activation-type subclass of `Layer`.
    ff_dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout after the FF dense layer.
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_use_sru: int or pair of ints; if > 0, we use this many SRU layers
      in addition to the feed-forward block (second int specifies sru size)
    ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity
    ff_sparsity_type: string, if ff_sparsity >0,
      use SparseFF if ff_sparsity_type=`'1inN'` and
      use BlockSparseFF if ff_sparsity_type=`'Block'`
    attention_chunk_size: int, if > 0 run attention chunked at this size
    attention_type: The attention layer to use.
    n_attention_layers: how many residual causal attention layers should we
      have before the feed-forward block (default: 1, the standard block)
    n_feedforward_layers: how many FFNN layers should we have (default 1).

  Returns:
    A list of layers that maps an activation tensor to an activation tensor.
  """
  # pylint: disable=g-complex-comprehension
  causal_attentions = [
      ApplyAttentionLayer(
          attention_type, d_model, n_heads, d_model // n_heads,
          d_model // n_heads, causal=True, masked=False,
          attention_dropout=dropout, output_dropout=dropout,
          attention_chunk_size=attention_chunk_size, mode=mode)
      for _ in range(n_attention_layers)
  ]
  residual_attentions = [
      tl.Residual(
          tl.LayerNorm(),
          causal_attentions[i],
          tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode))
      for i in range(n_attention_layers)
  ]
  feed_forwards = [
      tl.Residual(
          FeedForwardWithOptions(d_model, d_ff, dropout, dropout_shared_axes,
                                 ff_activation, ff_dropout, ff_chunk_size,
                                 ff_use_sru, ff_sparsity, mode, False,
                                 ff_sparsity_type))
      for _ in range(n_feedforward_layers)
  ]
  # pylint: enable=g-complex-comprehension

  return residual_attentions + feed_forwards
def test_call_in_eval_mode_does_no_dropout(self):
  layer = tl.Dropout(rate=0.1, mode='eval')
  x = np.ones((2, 5, 1000))
  y = layer(x)
  self.assertEqual(np.count_nonzero(y), 10_000)
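# Hypothetical train-mode companion to the eval-mode test above: with
# rate=0.1, roughly 10% of the 10,000 activations should be zeroed out. The
# tolerance below is an illustrative choice, not a library guarantee; assumes
# `import numpy as np` and `from trax import layers as tl`.
def test_call_in_train_mode_drops_some_values(self):
  layer = tl.Dropout(rate=0.1, mode='train')
  x = np.ones((2, 5, 1000))          # 10,000 activations in total
  y = layer(x)
  n_remaining = np.count_nonzero(y)  # expect roughly 9,000 survivors
  self.assertLess(abs(n_remaining - 9_000), 500)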
def createPosEncoder(vocabSize, embeddingDepth, dropout, maxLength, mode):
  return [
      tl.Embedding(vocabSize, embeddingDepth),
      tl.Dropout(rate=dropout, mode=mode),
      tl.PositionalEncoding(max_len=maxLength, mode=mode)
  ]
def ReformerLM(vocab_size, d_model=512, d_ff=2048, d_attention_key=64, d_attention_value=64, n_layers=6, n_heads=8, dropout=0.1, max_len=2048, attention_type=tl.SelfAttention, axial_pos_shape=(), d_axial_pos_embs=None, ff_activation=tl.FastGelu, ff_use_sru=0, ff_chunk_size=0, mode='train'): """Reversible transformer language model (only uses a decoder, no encoder). Args: vocab_size: int: vocab size d_model: int: depth of *each half* of the two-part features d_ff: int: depth of feed-forward layer d_attention_key: int: depth of key vector for each attention head d_attention_value: int: depth of value vector for each attention head n_layers: int: number of decoder layers n_heads: int: number of attention heads dropout: float: dropout rate (how much to drop out) max_len: int: maximum symbol length for positional encoding attention_type: class: attention class to use, such as SelfAttention. axial_pos_shape: tuple of ints: input shape to use for the axial position encoding. If unset, axial position encoding is disabled. d_axial_pos_embs: tuple of ints: depth of position embedding for each axis. Tuple length must match axial_pos_shape, and values must sum to d_model. ff_activation: the non-linearity in feed-forward layer ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks mode: str: 'train', 'eval', or 'predict' Returns: the layer. """ positional_encoding = PositionalEncoding(mode, dropout, max_len, axial_pos_shape, d_axial_pos_embs) positional_embedder = [ tl.Embedding(vocab_size, d_model), tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode), # pylint: disable=no-value-for-parameter positional_encoding, ] decoder_blocks = [] if isinstance(attention_type, (tuple, list)): assert n_layers % len(attention_type) == 0 else: attention_type = [attention_type] for layer_idx in range(n_layers): layer_attention_type = attention_type[layer_idx % len(attention_type)] decoder_block = DecoderBlock(d_model, d_ff, d_attention_key, d_attention_value, n_heads, attention_type=layer_attention_type, dropout=dropout, ff_activation=ff_activation, ff_use_sru=ff_use_sru, ff_chunk_size=ff_chunk_size, mode=mode) decoder_blocks.append(decoder_block) return tl.Serial( tl.ShiftRight(mode=mode), positional_embedder, tl.Dup(), tl.ReversibleSerial(decoder_blocks), tl.Concatenate(), # TODO(kitaev): Test whether dropout should go before or after the # LayerNorm, and whether dropout broadcasting is needed here. tl.LayerNorm(), tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode), # pylint: disable=no-value-for-parameter tl.Dense(vocab_size), tl.LogSoftmax(), )
def LayerDropTransformerLM(vocab_size, d_model=512, d_ff=2048, n_layers=6, n_heads=8, dropout=0.1, max_len=2048, mode='train', ff_activation=tl.Relu, skip_fraction=0.4): """Returns a LayerDrop Transformer language model. The input to the model is a tensor of tokens. (This model uses only the decoder part of the overall Transformer.) Args: vocab_size: int: vocab size d_model: int: depth of embedding d_ff: int: depth of feed-forward layer n_layers: int: number of encoder/decoder layers n_heads: int: number of attention heads dropout: float: dropout rate (how much to drop out) max_len: int: maximum symbol length for positional encoding mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference ff_activation: the non-linearity in feed-forward layer skip_fraction: probability of skipping a layer; it can be a single probability or a list of probabilities different for each layer Returns: A Transformer language model as a layer that maps from a tensor of tokens to activations over a vocab set. """ embedder = [ tl.Embedding(vocab_size, d_model), tl.Dropout(rate=dropout, mode=mode), tl.PositionalEncoding(max_len=max_len, mode=mode), ] if not isinstance(skip_fraction, (list, tuple)): # If we don't get a list of skip_fractions we use the same skip_fraction # for each layer. skip_fraction = [skip_fraction for i in range(n_layers)] if len(skip_fraction) != n_layers: raise ValueError('n_layers ({}) must be equal to len(skip_fraction) ({})' .format(n_layers, len(skip_fraction))) def ConditionedBlock(current_layer_num): return tl.Serial( # stack: embedding tl.RandomUniform(0., 1, sync=True), # stack: random_uniform, embedding tl.Cond( # if random_uniform > skip_fraction LargerThan(skip_fraction[current_layer_num] if mode == 'train' else 0.0), # then: run block tl.Serial(transformer._DecoderBlock( # pylint: disable=g-complex-comprehension,protected-access d_model, d_ff, n_heads, dropout, [], mode, ff_activation)), # else: run noop tl.Serial() ) # stack: embedding ) return tl.Serial( tl.ShiftRight(mode=mode), embedder, [ConditionedBlock(i) for i in range(n_layers)], tl.LayerNorm(), tl.Dense(vocab_size), tl.LogSoftmax(), )
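# Hypothetical construction-only sketch with placeholder sizes: skip_fraction
# may be a single float or a per-layer list, so a schedule that skips deeper
# layers more aggressively could be configured like this.
layer_drop_lm = LayerDropTransformerLM(
    vocab_size=256, d_model=64, d_ff=128, n_layers=4, n_heads=2,
    max_len=128, mode='train',
    skip_fraction=[0.0, 0.2, 0.4, 0.6])  # later layers are skipped more often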
def ReformerLM(vocab_size, d_model=512, d_ff=2048, d_attention_key=64, d_attention_value=64, n_layers=6, n_heads=8, dropout=0.1, max_len=2048, attention_type=tl.SelfAttention, pos_type=None, pos_axial_shape=(), pos_d_axial_embs=None, ff_activation=tl.FastGelu, ff_use_sru=0, ff_chunk_size=0, ff_sparsity=0, loss_sparsity_type='mult', loss_sparsity=0, loss_d_lowrank=0, loss_sparsity_prob=None, attention_chunk_size=0, mode='train'): """Reversible transformer language model (only uses a decoder, no encoder). Args: vocab_size: int: vocab size d_model: int: depth of *each half* of the two-part features d_ff: int: depth of feed-forward layer d_attention_key: int: depth of key vector for each attention head d_attention_value: int: depth of value vector for each attention head n_layers: int: number of decoder layers n_heads: int: number of attention heads dropout: float: dropout rate (how much to drop out) max_len: int: maximum symbol length for positional encoding attention_type: class: attention class to use, such as SelfAttention. pos_type: string, the type of positional embeddings to use. pos_axial_shape: tuple of ints: input shape to use for the axial position encoding. If unset, axial position encoding is disabled. pos_d_axial_embs: tuple of ints: depth of position embedding for each axis. Tuple length must match pos_axial_shape, and values must sum to d_model. ff_activation: the non-linearity in feed-forward layer ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity loss_sparsity_type: str, type of sparsity to used in loss layer. See SparseDenseWithOptions for options. None if no sparsity should be used. loss_sparsity: int, the sparsity for loss layer (if used) loss_d_lowrank: int, the dimensions for intermediate layer (if used) loss_sparsity_prob: float, the probability for sparse version of loss to be used. If None, only sparse version is used. attention_chunk_size: int, if > 0 run attention chunked at this size mode: str: 'train', 'eval', or 'predict' Returns: the layer. """ positional_encoding = ct.PositionalEncoder(mode, dropout, max_len, pos_type, pos_axial_shape, pos_d_axial_embs) positional_embedder = [ tl.Embedding(vocab_size, d_model), tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode), # pylint: disable=no-value-for-parameter positional_encoding, ] decoder_blocks = [] if isinstance(attention_type, (tuple, list)): assert n_layers % len(attention_type) == 0 else: attention_type = [attention_type] for layer_idx in range(n_layers): layer_attention_type = attention_type[layer_idx % len(attention_type)] decoder_block = DecoderBlock(d_model, d_ff, d_attention_key, d_attention_value, n_heads, attention_type=layer_attention_type, dropout=dropout, ff_activation=ff_activation, ff_dropout=dropout, ff_use_sru=ff_use_sru, ff_chunk_size=ff_chunk_size, ff_sparsity=ff_sparsity, attention_chunk_size=attention_chunk_size, mode=mode) decoder_blocks.append(decoder_block) dense_loss_layer = tl.SparseDenseWithOptions( vocab_size, d_input=d_model, sparsity_type=loss_sparsity_type, sparsity=loss_sparsity, d_lowrank=loss_d_lowrank, prob_sparse=loss_sparsity_prob, mode=mode) return tl.Serial( tl.ShiftRight(mode=mode), positional_embedder, tl.Dup(), tl.ReversibleSerial(decoder_blocks), tl.Concatenate(), # TODO(kitaev): Test whether dropout should go before or after the # LayerNorm, and whether dropout broadcasting is needed here. 
      tl.LayerNorm(),
      tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode),  # pylint: disable=no-value-for-parameter
      dense_loss_layer,
  )
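# Hypothetical construction-only sketch with placeholder sizes: a small
# reversible ReformerLM. When a list of attention types is passed, n_layers
# must be divisible by its length (see the assert above).
small_reformer_lm = ReformerLM(
    vocab_size=320, d_model=64, d_ff=128, n_layers=2, n_heads=2,
    max_len=512, mode='train')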
def EveryOtherLayerDropTransformerLM(vocab_size, d_model=512, d_ff=2048, n_layers=6, n_heads=8, dropout=0.1, max_len=2048, mode='train', ff_activation=tl.Relu, skip_mode='even', skip_fraction=0.5, eval_skip_fraction=0.0): """Returns an "EveryOther" LayerDrop Transformer language model. During each training step it either runs all layers, or skips a subset of layers. This subset is the same every time, and it is specified by "skip_mode". The input to the model is a tensor of tokens. (This model uses only the decoder part of the overall Transformer.) Args: vocab_size: int: vocab size d_model: int: depth of embedding d_ff: int: depth of feed-forward layer n_layers: int: number of encoder/decoder layers n_heads: int: number of attention heads dropout: float: dropout rate (how much to drop out) max_len: int: maximum symbol length for positional encoding mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference ff_activation: the non-linearity in feed-forward layer skip_mode: which layers to skip when skipping: even/odd/1half/2half. skip_fraction: fraction of times to skip layers eval_skip_fraction: fraction of times to skip layers during eval Returns: A Transformer language model as a layer that maps from a tensor of tokens to activations over a vocab set. """ embedder = [ tl.Embedding(vocab_size, d_model), tl.Dropout(rate=dropout, mode=mode), tl.PositionalEncoding(max_len=max_len, mode=mode), ] if mode == 'train': pass else: skip_fraction = eval_skip_fraction skip_mode_funs = { # which layers should be skipped? 'even': (lambda num: num%2 == 0), # 0th layer is even 'odd': (lambda num: num%2 == 1), '1half': (lambda num: num < (n_layers/2)), '2half': (lambda num: num >= (n_layers/2)), } skip_mode_fun = skip_mode_funs[skip_mode] @assert_shape('...sd,->...sd,') def ConditionedBlock(current_layer_num): return tl.Serial( # stack: embedding, n_layers_to_keep tl.Select([1, 0, 1]), # n_layers_to_keep, embedding, n_layers_to_keep tl.Cond( # if random() > skip_fraction OR layer not in skip_mode ... LargerThan(skip_fraction if skip_mode_fun(current_layer_num ) else 0.0), # then: run block tl.Serial( transformer._DecoderBlock( # pylint: disable=g-complex-comprehension,protected-access d_model, d_ff, n_heads, dropout, [], mode, ff_activation)) # else: noop (implicit) ) # stack: embedding, n_layers_to_keep ) return tl.Serial( tl.ShiftRight(mode=mode), embedder, # stack: embedding tl.RandomUniform(0., 1., sync=True), # stack: n_layers_to_keep, embedding tl.Swap(), # stack: embedding, n_layers_to_keep [ConditionedBlock(i) for i in range(n_layers)], # stack: embedding, n_layers_to_keep tl.Select([0], n_in=2), # stack: embedding tl.LayerNorm(), tl.Dense(vocab_size), )
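# Hypothetical construction-only sketch with placeholder sizes: with
# skip_mode='1half' and skip_fraction=0.5, roughly half of the training steps
# run only the second half of the stack, while eval_skip_fraction=0.0 keeps
# every layer at evaluation time.
every_other_lm = EveryOtherLayerDropTransformerLM(
    vocab_size=256, d_model=64, d_ff=128, n_layers=6, n_heads=2,
    max_len=128, mode='train', skip_mode='1half', skip_fraction=0.5,
    eval_skip_fraction=0.0)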
def _DecoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode, ff_activation, ff_dropout, ff_chunk_size, ff_use_sru, ff_sparsity, ff_sparsity_type, attention_chunk_size, attention_type): """Returns a list of layers that implements a Transformer decoder block. The input is an activation tensor. Args: d_model: Final dimension of tensors at most points in the model, including the initial embedding output. d_ff: Size of special dense layer in the feed-forward part of each block. n_heads: Number of attention heads. dropout: Stochastic rate (probability) for dropping an activation value when applying dropout within a block. dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful way to save memory and apply consistent masks to activation vectors at different sequence positions. mode: If `'train'`, each block will include dropout; else, it will pass all values through unaltered. ff_activation: Type of activation function at the end of each block; must be an activation-type subclass of `Layer`. ff_dropout: Stochastic rate (probability) for dropping an activation value when applying dropout after the FF dense layer. ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity ff_sparsity_type: string, if ff_sparsity >0, use SparseFF if ff_sparsity_type=`'1inN'` and use BlockSparseFF if ff_sparsity_type=`'Block'` attention_chunk_size: int, if > 0 run attention chunked at this size attention_type: The attention layer to use. Returns: A list of layers that maps an activation tensor to an activation tensor. """ causal_attention = ApplyAttentionLayer( attention_type, d_model, n_heads, d_model // n_heads, d_model // n_heads, causal=True, masked=False, attention_dropout=dropout, output_dropout=dropout, attention_chunk_size=attention_chunk_size, mode=mode) feed_forward = FeedForwardWithOptions(d_model, d_ff, dropout, dropout_shared_axes, ff_activation, ff_dropout, ff_chunk_size, ff_use_sru, ff_sparsity, mode, ff_sparsity_type) dropout_ = tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode) return [ tl.Residual( tl.LayerNorm(), causal_attention, dropout_, ), tl.Residual(feed_forward), ]
def test_run_reversible_same_as_default_extended(self): """Runs the reversible trainer, check results are the same as default.""" inputs_batch = np.arange(8).reshape((2, 4)) targets_batch = 2 * inputs_batch labeled_batch = (inputs_batch, targets_batch, np.ones_like(targets_batch)) # We want to test rng propagation too, so adding some dropout layers. first_layer = tl.Serial(tl.Embedding(9, 4), tl.Dropout(0.5), tl.Dup()) rev_layers1 = [ tl.ReversibleHalfResidual(tl.Dense(4), tl.Dropout(0.2)), tl.ReversibleSwap(), tl.ReversibleHalfResidual(tl.Dropout(0.5), tl.Dense(4)), tl.ReversibleSwap() ] mid_layer = tl.Serial(tl.Add(), tl.Dense(4), tl.Dup()) rev_layers2 = [ tl.ReversibleHalfResidual(tl.Dense(4), tl.Dropout(0.3)), tl.ReversibleSwap() ] loss_layer = tl.Serial(tl.Concatenate(), tl.Dense(19), tl.Dropout(0.3), tl.LogSoftmax(), tl.CrossEntropyLoss()) model = tl.Serial([first_layer] + rev_layers1 + [mid_layer] + rev_layers2 + [loss_layer]) rng_init = fastmath.random.get_prng(12) model.init(labeled_batch, rng=rng_init) optimizer_fn = optimizers.Adam # to test slots # Make 3 steps with the original trainer. optimizer = optimizer_fn() optimizer.tree_init(model.weights) trainer = optimizers.Trainer(model, optimizer) rng_step1 = fastmath.random.get_prng(7) rng_step2 = fastmath.random.get_prng(8) rng_step3 = fastmath.random.get_prng(9) trainer.one_step(labeled_batch, rng_step1) trainer.one_step(labeled_batch, rng_step2, learning_rate=0.02) trainer.one_step(labeled_batch, rng_step3, learning_rate=0.03) first_layer_weights1 = first_layer.weights rev_layer12_weights1 = rev_layers1[2].weights mid_layer_weights1 = mid_layer.weights rev_layer20_weights1 = rev_layers2[0].weights loss_layer_weights1 = loss_layer.weights # Now make 3 steps with reversible trainer. model.init(labeled_batch, rng=rng_init) # TODO(lukaszkaiser): this test seems to fail with memoize_jit, why? trainer = optimizers.ReversibleSerialTrainer( [(first_layer.sublayers, rev_layers1), (mid_layer.sublayers, rev_layers2)], loss_layer, optimizer_fn, memoize_jit=False) trainer.one_step(labeled_batch, rng_step1) trainer.one_step(labeled_batch, rng_step2, learning_rate=0.02) trainer.one_step(labeled_batch, rng_step3, learning_rate=0.03) # Check that weights end up the same. self._assert_all_equal(loss_layer_weights1, loss_layer.weights) self._assert_all_equal(rev_layer20_weights1, rev_layers2[0].weights) self._assert_all_equal(mid_layer_weights1, mid_layer.weights) self._assert_all_equal(rev_layer12_weights1, rev_layers1[2].weights) self._assert_all_equal(first_layer_weights1, first_layer.weights)
def FeedForwardWithOptions(d_model, d_ff, dropout, dropout_shared_axes,
                           ff_activation, ff_dropout, ff_chunk_size,
                           ff_use_sru, ff_sparsity, mode,
                           ff_sparsity_type='1inN'):
  """Feed-Forward block with all the options.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    ff_activation: Type of activation function at the end of each block; must
      be an activation-type subclass of `Layer`.
    ff_dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout after the FF dense layer.
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_use_sru: int; if > 0, we use this many SRU layers instead of
      feed-forward
    ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity
    mode: If `'train'`, each block will include dropout; else, it will pass
      all values through unaltered.
    ff_sparsity_type: string, if ff_sparsity >0,
      use SparseFF if ff_sparsity_type=`'1inN'` and
      use BlockSparseFF if ff_sparsity_type=`'Block'`

  Returns:
    A list of layers which maps vectors to vectors.
  """
  if ff_use_sru:
    return [tl.SRU(d_model) for _ in range(ff_use_sru)]
  elif ff_sparsity and ff_sparsity_type == '1inN':
    ff = tl.SparseFF(d_ff, n_elements_in_block=ff_sparsity,
                     d_lowrank=d_ff // ff_sparsity, mode=mode)
    if ff_chunk_size < 1:
      chunked_ff = ff
    else:
      chunked_ff = tl.BatchLeadingAxes(tl.Chunk(tl.Serial(ff), ff_chunk_size))
    return [
        tl.LayerNorm(), chunked_ff,
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode)
    ]
  elif ff_sparsity and ff_sparsity_type == 'Block':
    return [
        tl.LayerNorm(),
        tl.BlockSparseFF(d_ff, num_experts=ff_sparsity, mode=mode),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode)
    ]
  else:
    return [
        ChunkedFeedForward(d_model, d_ff, dropout, ff_activation, ff_dropout,
                           ff_chunk_size, mode)
    ]
def Embedder(vocab_size):  # tokens --> vectors
  return [
      tl.Embedding(vocab_size, d_model),
      tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode),
  ]
def HourglassLM(vocab_size, d_model=512, d_ff=2048, vanilla_layers=(1, 1), hierarchy='6@3', n_heads=8, dropout=0.1, dropout_shared_axes=None, mode='train', ff_activation=tl.FastGelu, vanilla_attn_type=RelativeAttentionWrapper, middle_attn_type=RelativeAttentionWrapper, downsampling_fn=AttentionResampling, upsampling_fn=AttentionResampling, attention_downsampling_fn=AveragePooling, attention_upsampling_fn=LinearUpsampling): """Returns a hierarchical Transformer language model. This model performs autoregressive language modeling: - input: rank 2 tensor representing a batch of text strings via token IDs plus padding markers; shape is (batch_size, sequence_length). The tensor elements are integers in `range(vocab_size)`, and `0` values mark padding positions. - output: rank 3 tensor representing a batch of log-probability distributions for each sequence position over possible token IDs; shape is (batch_size, sequence_length, `vocab_size`). This model uses only the decoder part of the overall Transformer. Args: vocab_size: Input vocabulary size -- each element of the input tensor should be an integer in `range(vocab_size)`. These integers typically represent token IDs from a vocabulary-based tokenizer. d_model: Final dimension of tensors at most points in the model, including the initial embedding output. d_ff: Size of special dense layer in the feed-forward part of each encoder block. vanilla_layers: (pre_layers, post_layers) tuple - number of full token-level Transformer decoder layers before and after shortening. hierarchy: string - shortening hierarchy, as described in the paper. Hierarchy levels must form a palindrome, e.g. '1@2 2@6 1@2'. n_heads: Number of attention heads. dropout: Stochastic rate (probability) for dropping an activation value when applying dropout within an encoder block. dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful way to save memory and apply consistent masks to activation vectors at different sequence positions. mode: str: 'train' or 'eval'. ff_activation: Type of activation function at the end of each encoder block; must be an activation-type subclass of `Layer`. vanilla_attn_type: class: attention class such as SelfAttention to use in the layers before and after shortening (vanilla layers). middle_attn_type: class: attention class to use in the middle layers (these operating on the shortened sequence). downsampling_fn: function that takes full token-level vectors of length `l` and transforms them into `l` / `k` vectors, where `k` denotes `shorten_factor` parameter. upsampling_fn: function that takes shortened representations of a sequence, consisting of `l` / `k` vectors and transforms them into full token-level representations of length `l`. attention_downsampling_fn: Downsampling function that transforms token-level vectors into query vectors with reduced length. Necessary only when AttentionResampling is used as `downsampling_fn`. attention_upsampling_fn: Upsampling function for AttentionResampling. Valid only when AttentionResampling is used as a `upsampling_fn`. Returns: A Transformer language model as a layer that maps from a tensor of tokens to activations over a vocab set. """ assert mode != 'predict' # For now, 'predict' mode is unsupported. 
hierarchy_n_layers, hierarchy_shorten_factors = _parse_hierarchy(hierarchy) token_encoder = [ tl.Embedding(vocab_size, d_model), tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode) ] context_bias_layer, location_bias_layer = get_rel_att_inputs( d_model, n_heads) n_pre_decoder_blocks, n_post_decoder_blocks = vanilla_layers def create_decoder_blocks( n_layers, total_pooling, # pylint: disable = invalid-name attention_type): decoder_blocks = [ # pylint: disable=g-complex-comprehension _RelativeDecoderBlock(attention_type, d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode, ff_activation, context_bias_layer, location_bias_layer, total_pooling) for _ in range(n_layers) ] return decoder_blocks + [tl.LayerNorm()] def create_hourglass_valley( rest_shorten_factors, rest_n_funnel_blocks, # pylint: disable = invalid-name current_total_pooling): assert rest_shorten_factors assert len(rest_shorten_factors) == len(rest_n_funnel_blocks) current_sf = rest_shorten_factors[0] current_n_layers = rest_n_funnel_blocks[0] shortening_layer = downsampling_fn( current_sf, d_model, is_upsampling=False, d_ff=d_ff, n_heads=n_heads, dropout=dropout, dropout_shared_axes=dropout_shared_axes, mode=mode, ff_activation=ff_activation, context_bias_layer=context_bias_layer, location_bias_layer=location_bias_layer, total_pooling=current_total_pooling, resampling_fn=attention_downsampling_fn) upsampling_layer = upsampling_fn( current_sf, d_model=d_model, is_upsampling=True, d_ff=d_ff, n_heads=n_heads, dropout=dropout, dropout_shared_axes=dropout_shared_axes, mode=mode, ff_activation=ff_activation, context_bias_layer=context_bias_layer, location_bias_layer=location_bias_layer, total_pooling=current_total_pooling, resampling_fn=attention_upsampling_fn) if len(rest_shorten_factors) > 1: # we need to go deeper again pre_stage_blocks = create_decoder_blocks( current_n_layers, current_total_pooling * current_sf, middle_attn_type) post_stage_blocks = create_decoder_blocks( current_n_layers, current_total_pooling * current_sf, middle_attn_type) return [ tl.Dup(), tl.ShiftRight(current_sf - 1, mode=mode), shortening_layer, pre_stage_blocks, *create_hourglass_valley( rest_shorten_factors[1:], rest_n_funnel_blocks[1:], current_total_pooling * current_sf), post_stage_blocks, upsampling_layer, tl.LayerNorm(), tl.Add() ] else: blocks = create_decoder_blocks(current_n_layers, current_total_pooling * current_sf, middle_attn_type) return [ tl.Dup(), tl.ShiftRight(current_sf - 1), shortening_layer, blocks, upsampling_layer, tl.LayerNorm(), tl.Add() ] pre_decoder_blocks = create_decoder_blocks(n_pre_decoder_blocks, 1, vanilla_attn_type) post_decoder_blocks = create_decoder_blocks(n_post_decoder_blocks, 1, vanilla_attn_type) valley = create_hourglass_valley(hierarchy_shorten_factors, hierarchy_n_layers, 1) # Assemble and return the model. return tl.Serial( # tokens (or chunked tuple of tokens) tl.ShiftRight(mode=mode), # toks token_encoder, # vecs pre_decoder_blocks, # vecs valley, # shortened vecs post_decoder_blocks, # vecs tl.Dense(vocab_size), # vecs )
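# Hypothetical construction-only sketch with placeholder sizes: hierarchy
# '6@3' gives six middle decoder blocks operating on a 3x-shortened sequence,
# with one vanilla token-level block before and after it ('predict' mode is
# unsupported, per the assert above).
hourglass_lm = HourglassLM(
    vocab_size=256, d_model=64, d_ff=128, vanilla_layers=(1, 1),
    hierarchy='6@3', n_heads=2, mode='train')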
def FunnelTransformerEncoder(vocab_size, n_classes=10, d_model=512, d_ff=2048, encoder_segment_lengths=(2, 2, 2), n_heads=8, max_len=2048, dropout=0.1, dropout_shared_axes=None, mode='train', ff_activation=tl.Relu, pool_layer=tl.AvgPool, pool_size=(2, ), strides=(2, ), separate_cls=True): """Returns a Funnel Encoder. This model performs text categorization: - input: rank 2 tensor representing a batch of text strings via token IDs plus padding markers; shape is (batch_size, sequence_length). The tensor elements are integers in `range(vocab_size)`, and `0` values mark padding positions. - output: rank 2 tensor representing a batch of log-probability distributions over N categories; shape is (batch_size, `n_classes`). Args: vocab_size: Input vocabulary size -- each element of the input tensor should be an integer in `range(vocab_size)`. These integers typically represent token IDs from a vocabulary-based tokenizer. n_classes: Final dimension of the output tensors, representing N-way classification. d_model: Final dimension of tensors at most points in the model, including the initial embedding output. d_ff: Size of special dense layer in the feed-forward part of each encoder block. encoder_segment_lengths: Tuple, where each element denotes the number of transformer encoder blocks preceding a funnel transformer block. There is no funnel block after the last sequence of encoder blocks, therefore the total number of blocks in the model is equal to `sum(encoder_segment_lengths) + len(encoder_segment_lengths) - 1`. n_heads: Number of attention heads. max_len: Maximum symbol length for positional encoding. dropout: Stochastic rate (probability) for dropping an activation value when applying dropout within an encoder block. dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful way to save memory and apply consistent masks to activation vectors at different sequence positions. mode: If `'train'`, each encoder block will include dropout; else, it will pass all values through unaltered. ff_activation: Type of activation function at the end of each encoder block; must be an activation-type subclass of `Layer`. pool_layer: Type of pooling layer used for downsampling in each of the funnel blocks; should be `tl.AvgPool` or `tl.MaxPool`. pool_size: Shape of window that gets reduced to a single vector value. If the layer inputs are :math:`n`-dimensional arrays, then `pool_size` must be a tuple of length :math:`n-2`. strides: Offsets from the location of one window to the locations of neighboring windows along each axis. If specified, must be a tuple of the same length as `pool_size`. If None, then offsets of 1 along each window axis, :math:`(1, ..., 1)`, will be used. separate_cls: If `True`, pooling in funnel blocks is not applied to embeddings of the first token (`cls` from BERT paper) and only final embedding of this token is used for categorization - the rest are discarded. If `False`, each token from the beginning is pooled and all embeddings are averaged and mapped to output categories like in original `TransformerEncoder` model. Returns: A Transformer model that maps strings (conveyed via token IDs) to probability-like activations over a range of output classes. 
""" assert encoder_segment_lengths positional_encoder = [ tl.Embedding(vocab_size, d_model), tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode), tl.PositionalEncoding(max_len=max_len) ] encoder_blocks = [] n_encoder_segments = len(encoder_segment_lengths) for i in range(n_encoder_segments): # Building i'th segment for _ in range(encoder_segment_lengths[i]): # Create segment_size encoder blocks encoder_blocks.append( _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode, ff_activation)) # If not last segment, add funnel block if i != n_encoder_segments - 1: encoder_blocks.append( _FunnelBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode, ff_activation, pool_layer, pool_size, strides, separate_cls)) cls_pooling = SelectFirst() if separate_cls else tl.Mean(axis=1) # Assemble and return the model. return tl.Serial( # toks # Encode. tl.Branch(positional_encoder, tl.PaddingMask()), # vecs masks encoder_blocks, # vecs masks tl.Select([0], n_in=2), # vecs tl.LayerNorm(), # vecs # Map to output categories. cls_pooling, # cls tl.Dense(n_classes), # cls tl.LogSoftmax(), # cls )
def LayerDropTransformerLM(vocab_size, d_model=512, d_ff=2048, n_layers=6, n_heads=8, dropout=0.1, max_len=2048, mode='train', ff_activation=tl.Relu, skip_fraction=0.4, eval_skip_fraction='every_other'): """Returns a LayerDrop Transformer language model. Based on Fan, Grave, Joulin 2019, https://arxiv.org/abs/1909.11556 . The input to the model is a tensor of tokens. (This model uses only the decoder part of the overall Transformer.) Args: vocab_size: int: vocab size d_model: int: depth of embedding d_ff: int: depth of feed-forward layer n_layers: int: number of encoder/decoder layers n_heads: int: number of attention heads dropout: float: dropout rate (how much to drop out) max_len: int: maximum symbol length for positional encoding mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference ff_activation: the non-linearity in feed-forward layer skip_fraction: probability of skipping a layer; it can be a single probability or a list of probabilities different for each layer eval_skip_fraction: probability of skipping a layer during eval; it can be a single probability, or a list of probabilities different for each layer, or a string "every other" implementing a strategy from original paper Returns: A Transformer language model as a layer that maps from a tensor of tokens to activations over a vocab set. """ embedder = [ tl.Embedding(vocab_size, d_model), tl.Dropout(rate=dropout, mode=mode), tl.PositionalEncoding(max_len=max_len, mode=mode), ] if not isinstance(skip_fraction, (list, tuple)): # If we don't get a list of skip_fractions we use the same skip_fraction # for each layer. skip_fraction = [skip_fraction for i in range(n_layers)] if len(skip_fraction) != n_layers: raise ValueError( 'n_layers ({}) must be equal to len(skip_fraction) ({})'.format( n_layers, len(skip_fraction))) if eval_skip_fraction == 'every_other': # 100% skipping for even-numbered layers; 0% for odd-numbered layers. eval_skip_fraction = [ (1.0 if i % int(1. / skip_fraction[i]) == 0 else 0.0) if skip_fraction[i] != 0 else 0.0 for i in range(n_layers) ] if eval_skip_fraction == 'same': # Same skip_fraction as in training. eval_skip_fraction = skip_fraction if not isinstance(eval_skip_fraction, (list, tuple)): # If we don't get a list of eval_skip_fractions we use the same # eval_skip_fraction for each layer. eval_skip_fraction = [eval_skip_fraction for i in range(n_layers)] if len(eval_skip_fraction) != n_layers: raise ValueError( 'n_layers ({}) must be equal to len(eval_skip_fraction) ({})'. format(n_layers, len(eval_skip_fraction))) @assert_shape('...sd->...sd') def ConditionedBlock(current_layer_num): return tl.Serial( # stack: embedding tl.RandomUniform(0., 1, sync=True), # stack: random_uniform, embedding tl.Cond( # if random_uniform > skip_fraction LargerThan(skip_fraction[current_layer_num] if mode == 'train' else eval_skip_fraction[current_layer_num]), # then: run block tl.Serial( transformer._DecoderBlock( # pylint: disable=g-complex-comprehension,protected-access d_model, d_ff, n_heads, dropout, [], mode, ff_activation)), # else: run noop tl.Serial()) # stack: embedding ) return tl.Serial( tl.ShiftRight(mode=mode), embedder, [ConditionedBlock(i) for i in range(n_layers)], tl.LayerNorm(), tl.Dense(vocab_size), )
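# Pure-Python illustration of the 'every_other' eval schedule computed above:
# with a uniform training skip_fraction of 0.5, every int(1 / 0.5) == 2nd
# layer is skipped entirely at eval time and the remaining layers always run.
n_layers, skip_fraction = 6, [0.5] * 6
eval_skip_fraction = [
    (1.0 if i % int(1. / skip_fraction[i]) == 0 else 0.0)
    if skip_fraction[i] != 0 else 0.0
    for i in range(n_layers)
]
assert eval_skip_fraction == [1.0, 0.0, 1.0, 0.0, 1.0, 0.0]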
def FunnelTransformer(vocab_size, d_model=512, d_ff=2048, encoder_segment_lengths=(2, 2, 2), n_decoder_blocks=2, n_heads=8, max_len=2048, dropout=0.1, dropout_shared_axes=None, mode='train', ff_activation=tl.Relu, pool_layer=tl.AvgPool, pool_size=(2, ), separate_cls=True): """Returns a Full Funnel Transformer, that can be used for example for BERT. This model outputs token-level categorical distributions over all vocab: - input: rank 2 tensor representing a batch of text strings via token IDs plus padding markers; shape is (batch_size, sequence_length). The tensor elements are integers in `range(vocab_size)`, and `0` values mark padding positions. - output: rank 3 tensor representing a batch of log-probability distributions over `vocab_size` categories for each token; shape is (batch_size, sequence_length, vocab_size). Args: vocab_size: Input vocabulary size -- each element of the input tensor should be an integer in `range(vocab_size)`. These integers typically represent token IDs from a vocabulary-based tokenizer. d_model: Final dimension of tensors at most points in the model, including the initial embedding output. d_ff: Size of special dense layer in the feed-forward part of each encoder block. encoder_segment_lengths: Tuple, where each element denotes the number of transformer encoder blocks preceding a funnel transformer block. There is no funnel block after the last sequence of encoder blocks, therefore the total number of blocks in the model is equal to `sum(encoder_segment_lengths) + len(encoder_segment_lengths) - 1`. n_decoder_blocks: Number of transformer blocks in the upsampling decoder. n_heads: Number of attention heads. max_len: Maximum symbol length for positional encoding. dropout: Stochastic rate (probability) for dropping an activation value when applying dropout within an encoder block. dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful way to save memory and apply consistent masks to activation vectors at different sequence positions. mode: If `'train'`, each encoder block will include dropout; else, it will pass all values through unaltered. ff_activation: Type of activation function at the end of each encoder block; must be an activation-type subclass of `Layer`. pool_layer: Type of pooling layer used for downsampling in each of the funnel blocks; should be `tl.AvgPool` or `tl.MaxPool`. pool_size: Shape of window that gets reduced to a single vector value. If the layer inputs are :math:`n`-dimensional arrays, then `pool_size` must be a tuple of length :math:`n-2`. separate_cls: If `True`, pooling in funnel blocks is not applied to embeddings of the first token (`cls` from BERT paper) and only final embedding of this token is used for categorization - the rest are discarded. If `False`, each token from the beginning is pooled and all embeddings are averaged and mapped to output categories like in original `TransformerEncoder` model. 
""" assert encoder_segment_lengths positional_encoder = [ tl.Embedding(vocab_size, d_model), tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode), tl.PositionalEncoding(max_len=max_len) ] n_encoder_segments = len(encoder_segment_lengths) encoder_blocks_before_first_pooling = [ _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode, ff_activation) for _ in range(encoder_segment_lengths[0]) ] encoder_blocks_from_first_pooling = [] for i in range(1, n_encoder_segments): # Building i'th segment # Add funnel block between segments encoder_blocks_from_first_pooling.append( _FunnelBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode, ff_activation, pool_layer, pool_size=pool_size, strides=pool_size, separate_cls=separate_cls)) for _ in range(encoder_segment_lengths[i]): # Create segment_size encoder blocks encoder_blocks_from_first_pooling.append( _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode, ff_activation)) decoder_blocks = [ _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode, ff_activation) for _ in range(n_decoder_blocks) ] total_pool_size = pool_size[0]**(len(encoder_segment_lengths) - 1) # Assemble and return the model. return tl.Serial( # toks tl.Branch(positional_encoder, tl.PaddingMask()), # vecs masks encoder_blocks_before_first_pooling, # vecs masks tl.Select([0, 1, 0, 1]), # vecs masks residual = vecs old_masks encoder_blocks_from_first_pooling, # vecs masks residual masks tl.Select([0, 2, 3]), # vecs residual masks tl.Parallel( # residual from first segment is taken before # normalization, so apply it now None, tl.LayerNorm(), None), # vecs norm(residual) masks _Upsampler(total_pool_size, separate_cls), # vecs masks decoder_blocks, tl.Select([0], n_in=2), # vecs tl.LayerNorm(), tl.Dense(vocab_size), tl.LogSoftmax())
def model_fn(mode='train'):
  return tl.Serial(
      tl.Dropout(mode=mode, rate=0.1),
      tl.BatchNorm(mode=mode),
      models.MLP(d_hidden=16, n_output_classes=n_classes, mode=mode))
def ReformerShortenLM(vocab_size, shorten_factor=1, d_embedding=256, d_model=512, d_ff=2048, d_attention_key=64, d_attention_value=64, n_layers=6, n_heads=8, dropout=0.1, max_len=2048, attention_type=tl.SelfAttention, axial_pos_shape=(), d_axial_pos_embs=None, ff_activation=tl.FastGelu, ff_use_sru=0, ff_chunk_size=0, mode='train'): """Reversible transformer language model with shortening. When shorten_factor is F and processing an input of shape [batch, length], we embed the (shifted-right) input and then group each F elements (on length) into a single vector -- so that in the end we process a tensor of shape :: [batch, length // F, d_model] almost until the end -- at the end it's un-shortend and a SRU is applied. This reduces the length processed inside the main model body, effectively making the model faster but possibly slightly less accurate. Args: vocab_size: int: vocab size shorten_factor: by how much to shorten, see above d_embedding: the depth of the embedding layer and final logits d_model: int: depth of *each half* of the two-part features d_ff: int: depth of feed-forward layer d_attention_key: int: depth of key vector for each attention head d_attention_value: int: depth of value vector for each attention head n_layers: int: number of decoder layers n_heads: int: number of attention heads dropout: float: dropout rate (how much to drop out) max_len: int: maximum symbol length for positional encoding attention_type: class: attention class to use, such as SelfAttention. axial_pos_shape: tuple of ints: input shape to use for the axial position encoding. If unset, axial position encoding is disabled. d_axial_pos_embs: tuple of ints: depth of position embedding for each axis. Tuple length must match axial_pos_shape, values must sum to d_embedding. ff_activation: the non-linearity in feed-forward layer ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks mode: str: 'train' or 'eval' Returns: the layer. """ assert mode != 'predict' # TODO(lukaszkaiser,kitaev): fast inference if not axial_pos_shape: positional_encoding = tl.PositionalEncoding(max_len=max_len, dropout=dropout, mode=mode) else: assert d_axial_pos_embs is not None positional_encoding = tl.AxialPositionalEncoding( shape=axial_pos_shape, d_embs=d_axial_pos_embs, dropout_broadcast_dims=tuple(range(1, len(axial_pos_shape) + 1)), dropout=dropout, mode=mode) positional_embedder = [ tl.Embedding(vocab_size, d_embedding), tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode), # pylint: disable=no-value-for-parameter positional_encoding, ] decoder_blocks = [] if isinstance(attention_type, (tuple, list)): assert n_layers % len(attention_type) == 0 else: attention_type = [attention_type] for layer_idx in range(n_layers): layer_attention_type = attention_type[layer_idx % len(attention_type)] decoder_block = DecoderBlock(d_model, d_ff, d_attention_key, d_attention_value, n_heads, attention_type=layer_attention_type, dropout=dropout, ff_activation=ff_activation, ff_use_sru=ff_use_sru, ff_chunk_size=ff_chunk_size, mode=mode) decoder_blocks.append(decoder_block) # pylint: disable=g-long-lambda return tl.Serial( tl.ShiftRight(), positional_embedder, tl.Dup(), # Stack has (x, x), the first will be shortened # Before shortening, we need to pad by shorten factor so as not to leak # information into the future. To understand why, imagine shorten factor # of 2 and sequence of length 4, so ABCD. 
If we shift just by 1, then we # would have 0ABC, which gets grouped to [0A][BC] on input, which is # predicting ABCD as targets. The problem is that [0A] has access to A # and [BC] has access to C -- it will learn to copy it, peek into # the future. Shifting twice to [00][AB] solves the problem as the first # "big" symbol becomes all-0 and the rest is shifted enough. tl.ShiftRight(n_positions=shorten_factor - 1), tl.Fn( 'Shorten', lambda x: jnp.reshape( # Shorten -- move to depth. x, (x.shape[0], x.shape[1] // shorten_factor, -1)), n_out=1), tl.Dense(d_model), tl.Dup(), # Stack has (short_x, short_x, x) tl.ReversibleSerial(decoder_blocks), tl.Select([0], n_in=2), tl.LayerNorm(), tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode), # pylint: disable=no-value-for-parameter tl.Dense(shorten_factor * d_embedding), tl.Fn( 'ProlongBack', lambda x: jnp.reshape( # Prolong back. x, (x.shape[0], x.shape[1] * shorten_factor, -1)), n_out=1), tl.Concatenate(), # Concatenate with just the embeddings. tl.CausalConv(d_embedding), tl.Relu(), tl.SRU(d_embedding), # One RNN layer for conditional dependence. tl.Dense(vocab_size), tl.LogSoftmax())
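# Pure-numpy illustration of the 'Shorten' and 'ProlongBack' reshapes above
# with shorten_factor 2: groups of consecutive positions are folded into the
# depth dimension and later unfolded again. Assumes `import numpy as np`.
batch, length, d_emb, factor = 1, 4, 3, 2
x = np.arange(batch * length * d_emb).reshape((batch, length, d_emb))
short = np.reshape(x, (x.shape[0], x.shape[1] // factor, -1))            # (1, 2, 6)
back = np.reshape(short, (short.shape[0], short.shape[1] * factor, -1))  # (1, 4, 3)
assert np.array_equal(back, x)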
def TransformerDecoder(vocab_size=None, d_model=512, d_ff=2048, n_layers=6, n_heads=8, max_len=2048, dropout=0.1, dropout_shared_axes=None, mode='train', ff_activation=tl.Relu): """Returns a Transformer decoder. This model maps sequential inputs to sequential outputs: - input if `vocab_size` is specified: rank 2 tensor representing a batch of text strings via token IDs plus padding markers; shape is (batch_size, sequence_length). The tensor elements are integers in `range(vocab_size)`, and `0` values mark padding positions. - input if `vocab_size` is None: rank 3 tensor representing a batch of activation vectors; shape is (batch_size, sequence_length, `d_model`). - output: rank 3 tensor with shape (batch_size, sequence_length, `d_model`). The model uses causal attention and does *not* shift the input to the right. Thus, the output for position `t` is based on inputs up to and including position `t`. Args: vocab_size: If specified, gives the input vocabulary size -- each element of the input tensor should be an integer in `range(vocab_size)`. If None, indicates that the model expects as input floating point vectors, each with `d_model` components. d_model: Final dimension of tensors at most points in the model, including the initial embedding output. d_ff: Size of special dense layer in the feed-forward part of each decoder block. n_layers: Number of decoder blocks. Each block includes attention, dropout, residual, feed-forward (`Dense`), and activation layers. n_heads: Number of attention heads. max_len: Maximum symbol length for positional encoding. dropout: Stochastic rate (probability) for dropping an activation value when applying dropout within a decoder block. dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful way to save memory and apply consistent masks to activation vectors at different sequence positions. mode: If `'train'`, each decoder block will include dropout; else, it will pass all values through unaltered. ff_activation: Type of activation function at the end of each decoder block; must be an activation-type subclass of `Layer`. Returns: If `vocab_size` is defined: a Transformer model that maps strings (conveyed via token IDs) to sequences of activation vectors. If `vocab_size` is None: a Transformer model that maps sequences of activation vectors to sequences of activation vectors. """ positional_encoder = [(tl.Embedding(vocab_size, d_model) if vocab_size is not None else tl.Dense(d_model)), tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode), tl.PositionalEncoding(max_len=max_len)] decoder_blocks = [ # pylint: disable=g-complex-comprehension _DecoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode, ff_activation) for i in range(n_layers) ] # Assemble and return the model. return tl.Serial( # toks positional_encoder, # vecs decoder_blocks, # vecs tl.LayerNorm(), # vecs )
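# Hypothetical construction-only sketch with placeholder sizes: with
# vocab_size=None the decoder consumes float activation vectors of depth
# d_model rather than token IDs and, unlike TransformerLM, applies no
# ShiftRight and no final projection to the vocabulary.
vector_decoder = TransformerDecoder(
    vocab_size=None, d_model=64, d_ff=128, n_layers=2, n_heads=2,
    max_len=128, mode='train')  # maps (batch, len, 64) -> (batch, len, 64)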
def LayerDropSkippingTransformerLM(vocab_size, d_model=512, d_ff=2048, n_layers=6, n_heads=8, dropout=0.1, max_len=2048, mode='train', ff_activation=tl.Relu, skip_fraction=0.4): """Returns a Skipping Transformer language model. The input to the model is a tensor of tokens. (This model uses only the decoder part of the overall Transformer.) Args: vocab_size: int: vocab size d_model: int: depth of embedding d_ff: int: depth of feed-forward layer n_layers: int: number of encoder/decoder layers n_heads: int: number of attention heads dropout: float: dropout rate (how much to drop out) max_len: int: maximum symbol length for positional encoding mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference ff_activation: the non-linearity in feed-forward layer skip_fraction: fraction of times to skip some layers Returns: A Transformer language model as a layer that maps from a tensor of tokens to activations over a vocab set. """ embedder = [ tl.Embedding(vocab_size, d_model), tl.Dropout(rate=dropout, mode=mode), tl.PositionalEncoding(max_len=max_len, mode=mode), ] def ConditionedBlock(current_layer_num): return tl.Serial( # stack: embedding, n_layers_to_keep tl.Select([1, 0, 1]), # n_layers_to_keep, embedding, n_layers_to_keep tl.Cond( # if n_layers_to_keep > current_layer_num LargerThan(float(current_layer_num)), # then: run block tl.Serial(transformer._DecoderBlock( # pylint: disable=g-complex-comprehension,protected-access d_model, d_ff, n_heads, dropout, [], mode, ff_activation)), # else: run noop tl.Serial() ) # stack: embedding, n_layers_to_keep ) if mode == 'train': minimum_layers = 0.0 maximum_layers = float(n_layers) / skip_fraction else: minimum_layers = maximum_layers = float(n_layers) return tl.Serial( tl.ShiftRight(mode=mode), embedder, # stack: embedding tl.RandomUniform(minimum_layers, maximum_layers, sync=True), # stack: n_layers_to_keep, embedding tl.Swap(), # stack: embedding, n_layers_to_keep [ConditionedBlock(i) for i in range(n_layers)], # stack: embedding, n_layers_to_keep tl.Select([0], n_in=2), # stack: embedding tl.LayerNorm(), tl.Dense(vocab_size), tl.LogSoftmax(), )
def TransformerLM(vocab_size,
                  d_model=512,
                  d_ff=2048,
                  n_layers=6,
                  n_heads=8,
                  max_len=2048,
                  dropout=0.1,
                  dropout_shared_axes=None,
                  mode='train',
                  ff_activation=tl.Relu):
  """Returns a Transformer language model.

  This model performs autoregressive language modeling:

    - input: rank 2 tensor representing a batch of text strings via token IDs
      plus padding markers; shape is (batch_size, sequence_length). The tensor
      elements are integers in `range(vocab_size)`, and `0` values mark
      padding positions.

    - output: rank 3 tensor representing a batch of log-probability
      distributions for each sequence position over possible token IDs;
      shape is (batch_size, sequence_length, `vocab_size`).

  This model uses only the decoder part of the overall Transformer.

  Args:
    vocab_size: Input vocabulary size -- each element of the input tensor
      should be an integer in `range(vocab_size)`. These integers typically
      represent token IDs from a vocabulary-based tokenizer.
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each encoder
      block.
    n_layers: Number of encoder blocks. Each block includes attention,
      dropout, residual, feed-forward (`Dense`), and activation layers.
    n_heads: Number of attention heads.
    max_len: Maximum symbol length for positional encoding.
    dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout within an encoder block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    mode: If `'predict'`, use fast inference. If `'train'`, each encoder block
      will include dropout; else, it will pass all values through unaltered.
    ff_activation: Type of activation function at the end of each encoder
      block; must be an activation-type subclass of `Layer`.

  Returns:
    A Transformer language model as a layer that maps from a tensor of tokens
    to activations over a vocab set.
  """
  positional_encoder = [
      tl.Embedding(vocab_size, d_model),
      tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
      tl.PositionalEncoding(max_len=max_len, mode=mode)
  ]

  decoder_blocks = [
      # pylint: disable=g-complex-comprehension
      _DecoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                    mode, ff_activation)
      for i in range(n_layers)
  ]

  # Assemble and return the model.
  return tl.Serial(              # tokens (or chunked tuple of tokens)
      tl.ShiftRight(mode=mode),  # toks
      positional_encoder,        # vecs
      decoder_blocks,            # vecs
      tl.LayerNorm(),            # vecs
      tl.Dense(vocab_size),      # vecs
  )
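# Hypothetical end-to-end sketch with placeholder sizes, mirroring the
# init-with-a-concrete-batch pattern used in the reversible-trainer test
# above; assumes `import numpy as np`. A tiny TransformerLM maps int32 token
# IDs of shape (batch, length) to activations of shape
# (batch, length, vocab_size).
tiny_lm = TransformerLM(vocab_size=16, d_model=32, d_ff=64, n_layers=2,
                        n_heads=2, max_len=8, mode='eval')
tokens = np.ones((1, 8), dtype=np.int32)
tiny_lm.init(tokens)
activations = tiny_lm(tokens)
assert activations.shape == (1, 8, 16)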
def test_new_weights(self): layer = tl.Dropout(rate=0.1, mode='train') layer.init(None) self.assertEmpty(layer.weights)
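# Companion sketch (an assumption, not from the source): in 'eval' mode trax
# Dropout passes inputs through unchanged, so outputs equal inputs exactly.
# Written in the same test-method style as above; assumes `np` and `shapes`
# are imported in the test module.
def test_eval_mode_is_identity(self):
    layer = tl.Dropout(rate=0.1, mode='eval')
    x = np.ones((2, 3), dtype=np.float32)
    layer.init(shapes.signature(x))
    self.assertTrue(np.array_equal(layer(x), x))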
def TransformerEncoder(vocab_size, n_classes=10, d_model=512, d_ff=2048, n_layers=6, n_heads=8, max_len=2048, dropout=0.1, dropout_shared_axes=None, mode='train', ff_activation=tl.Relu): """Returns a Transformer encoder merged with an N-way categorization head. This model performs text categorization: - input: rank 2 tensor representing a batch of text strings via token IDs plus padding markers; shape is (batch_size, sequence_length). The tensor elements are integers in `range(vocab_size)`, and `0` values mark padding positions. - output: rank 2 tensor representing a batch of log-probability distributions over N categories; shape is (batch_size, `n_classes`). Args: vocab_size: Input vocabulary size -- each element of the input tensor should be an integer in `range(vocab_size)`. These integers typically represent token IDs from a vocabulary-based tokenizer. n_classes: Final dimension of the output tensors, representing N-way classification. d_model: Final dimension of tensors at most points in the model, including the initial embedding output. d_ff: Size of special dense layer in the feed-forward part of each encoder block. n_layers: Number of encoder blocks. Each block includes attention, dropout, residual, feed-forward (`Dense`), and activation layers. n_heads: Number of attention heads. max_len: Maximum symbol length for positional encoding. dropout: Stochastic rate (probability) for dropping an activation value when applying dropout within an encoder block. dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful way to save memory and apply consistent masks to activation vectors at different sequence positions. mode: If `'train'`, each encoder block will include dropout; else, it will pass all values through unaltered. ff_activation: Type of activation function at the end of each encoder block; must be an activation-type subclass of `Layer`. Returns: A Transformer model that maps strings (conveyed via token IDs) to probability-like activations over a range of output classes. """ positional_encoder = [ tl.Embedding(vocab_size, d_model), tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode), tl.PositionalEncoding(max_len=max_len) ] encoder_blocks = [ _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode, ff_activation) for i in range(n_layers) ] # Assemble and return the model. return tl.Serial( # toks # Encode. tl.Branch(positional_encoder, tl.PaddingMask()), # vecs masks encoder_blocks, # vecs masks tl.Select([0], n_in=2), # vecs tl.LayerNorm(), # vecs # Map to output categories. tl.Mean(axis=1), # vecs tl.Dense(n_classes), # vecs )
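# Hypothetical usage sketch, not part of the source: run the encoder-classifier
# on a padded batch of token IDs (the `0` values mark padding). Assumes
# `_EncoderBlock` and `tl` from the defining module are in scope. The output has
# one row of class activations per example.
import numpy as np
from trax import shapes

classifier = TransformerEncoder(vocab_size=512, n_classes=3, d_model=128,
                                d_ff=256, n_layers=2, n_heads=4, max_len=64,
                                mode='eval')
batch = np.array([[5, 7, 9, 0, 0], [3, 3, 3, 3, 3]], dtype=np.int32)
classifier.init(shapes.signature(batch))
class_activations = classifier(batch)   # shape (2, 3)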
def ConfigurableTransformerEncoder(vocab_size, n_classes=10, d_model=512, d_ff=2048, n_layers=6, n_heads=8, max_len=2048, dropout=0.1, dropout_shared_axes=None, mode='train', ff_activation=tl.Relu, ff_dropout=0.1, ff_chunk_size=0, ff_use_sru=0, ff_sparsity=0, ff_sparsity_type='1inN', attention_chunk_size=0, attention_type=tl.Attention, pos_type=None, pos_axial_shape=None, pos_d_axial_embs=None): """Returns a Transformer encoder merged with an N-way categorization head. This model performs text categorization: - input: rank 2 tensor representing a batch of text strings via token IDs plus padding markers; shape is (batch_size, sequence_length). The tensor elements are integers in `range(vocab_size)`, and `0` values mark padding positions. - output: rank 2 tensor representing a batch of log-probability distributions over N categories; shape is (batch_size, `n_classes`). Args: vocab_size: Input vocabulary size -- each element of the input tensor should be an integer in `range(vocab_size)`. These integers typically represent token IDs from a vocabulary-based tokenizer. n_classes: Final dimension of the output tensors, representing N-way classification. d_model: Final dimension of tensors at most points in the model, including the initial embedding output. d_ff: Size of special dense layer in the feed-forward part of each encoder block. n_layers: Number of encoder blocks. Each block includes attention, dropout, residual, feed-forward (`Dense`), and activation layers. n_heads: Number of attention heads. max_len: Maximum symbol length for positional encoding. dropout: Stochastic rate (probability) for dropping an activation value when applying dropout within an encoder block. dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful way to save memory and apply consistent masks to activation vectors at different sequence positions. mode: If `'train'`, each encoder block will include dropout; else, it will pass all values through unaltered. ff_activation: Type of activation function at the end of each encoder block; must be an activation-type subclass of `Layer`. ff_dropout: Stochastic rate (probability) for dropping an activation value when applying dropout after the FF dense layer. ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks ff_use_sru: int or pair of ints; if > 0, we use this many SRU layers in addition to the feed-forward block (second int specifies sru size) ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity ff_sparsity_type: string, if ff_sparsity >0, use SparseFF if ff_sparsity_type=`'1inN'` and use BlockSparseFF if ff_sparsity_type=`'Block'` attention_chunk_size: int, if > 0 run attention chunked at this size attention_type: The attention layer to use for the encoder part. pos_type: string, the type of positional embeddings to use. pos_axial_shape: tuple of ints: input shape to use for the axial position encoding. If unset, axial position encoding is disabled. pos_d_axial_embs: tuple of ints: depth of position embedding for each axis. Tuple length must match pos_axial_shape, and values must sum to d_model. Returns: A Transformer model that maps strings (conveyed via token IDs) to probability-like activations over a range of output classes. 
""" positional_encoder = [ tl.Embedding(vocab_size, d_model), tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode), PositionalEncoder(mode, dropout, max_len, pos_type, pos_axial_shape, pos_d_axial_embs) ] # pylint: disable=g-complex-comprehension encoder_blocks = [ EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode, ff_activation, ff_dropout, ff_chunk_size, ff_use_sru, ff_sparsity, ff_sparsity_type, attention_chunk_size, attention_type) for i in range(n_layers) ] # pylint: enable=g-complex-comprehension # Assemble and return the model. return tl.Serial( # toks # Encode. tl.Branch(positional_encoder, tl.PaddingMask()), # vecs masks encoder_blocks, # vecs masks tl.Select([0], n_in=2), # vecs tl.LayerNorm(), # vecs # Map to output categories. tl.Mean(axis=1), # vecs tl.Dense(n_classes), # vecs )
def PositionalEncoder(vocab_size): # tokens --> vectors return [ tl.Embedding(vocab_size, d_model), tl.Dropout(rate=dropout, mode=mode), tl.PositionalEncoding(max_len=max_len), ]
def ConfigurableTransformerLM(vocab_size, d_model=512, d_ff=2048, n_layers=6, n_heads=8, max_len=2048, dropout=0.1, dropout_shared_axes=None, mode='train', ff_activation=tl.Relu, ff_dropout=0.1, ff_chunk_size=0, ff_use_sru=0, ff_sparsity=0, ff_sparsity_type='1inN', loss_sparsity_type='mult', loss_sparsity=0, loss_d_lowrank=0, loss_sparsity_prob=None, attention_chunk_size=0, attention_type=tl.CausalAttention, pos_type=None, pos_axial_shape=None, pos_d_axial_embs=None): """Returns a Transformer language model. This model performs autoregressive language modeling: - input: rank 2 tensor representing a batch of text strings via token IDs plus padding markers; shape is (batch_size, sequence_length). The tensor elements are integers in `range(vocab_size)`, and `0` values mark padding positions. - output: rank 3 tensor representing a batch of log-probability distributions for each sequence position over possible token IDs; shape is (batch_size, sequence_length, `vocab_size`). This model uses only the decoder part of the overall Transformer. Args: vocab_size: Input vocabulary size -- each element of the input tensor should be an integer in `range(vocab_size)`. These integers typically represent token IDs from a vocabulary-based tokenizer. d_model: Final dimension of tensors at most points in the model, including the initial embedding output. d_ff: Size of special dense layer in the feed-forward part of each decoder block. n_layers: Number of decoder blocks. Each block includes attention, dropout, residual, feed-forward (`Dense`), and activation layers. n_heads: Number of attention heads. max_len: Maximum symbol length for positional encoding. dropout: Stochastic rate (probability) for dropping an activation value when applying dropout within a decoder block. dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful way to save memory and apply consistent masks to activation vectors at different sequence positions. mode: If `'predict'`, use fast inference. If `'train'`, each decoder block will include dropout; else, it will pass all values through unaltered. ff_activation: Type of activation function at the end of each decoder block; must be an activation-type subclass of `Layer`. ff_dropout: Stochastic rate (probability) for dropping an activation value when applying dropout after the FF dense layer. ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks ff_use_sru: int or pair of ints; if > 0, we use this many SRU layers in addition to the feed-forward block (second int specifies sru size) ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity ff_sparsity_type: string, if ff_sparsity > 0, use SparseFF if ff_sparsity_type=`'1inN'` and use BlockSparseFF if ff_sparsity_type=`'Block'` loss_sparsity_type: string, type of sparsity to use in the loss layer. See SparseDenseWithOptions for options. None if no sparsity should be used. loss_sparsity: int, the sparsity for loss layer (if used) loss_d_lowrank: int, the dimensions for intermediate layer (if used) loss_sparsity_prob: float, the probability for the sparse version of the loss to be used. If None, the sparse version is always used. attention_chunk_size: int, if > 0 run attention chunked at this size attention_type: The attention layer to use for the decoder part. pos_type: string, the type of positional embeddings to use. pos_axial_shape: tuple of ints: input shape to use for the axial position encoding.
If unset, axial position encoding is disabled. pos_d_axial_embs: tuple of ints: depth of position embedding for each axis. Tuple length must match pos_axial_shape, and values must sum to d_model. Returns: A Transformer language model as a layer that maps from a tensor of tokens to activations over a vocab set. """ positional_encoder = [ tl.Embedding(vocab_size, d_model), tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode), PositionalEncoder(mode, dropout, max_len, pos_type, pos_axial_shape, pos_d_axial_embs) ] # pylint: disable=g-complex-comprehension decoder_blocks = [ DecoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode, ff_activation, ff_dropout, ff_chunk_size, ff_use_sru, ff_sparsity, ff_sparsity_type, attention_chunk_size, attention_type) for i in range(n_layers) ] # pylint: enable=g-complex-comprehension # Assemble and return the model. return tl.Serial( # tokens (or chunked tuple of tokens) tl.ShiftRight(mode=mode), # toks positional_encoder, # vecs decoder_blocks, # vecs tl.LayerNorm(), # vecs tl.SparseDenseWithOptions( # vecs vocab_size, d_input=d_model, sparsity_type=loss_sparsity_type, sparsity=loss_sparsity, d_lowrank=loss_d_lowrank, prob_sparse=loss_sparsity_prob, mode=mode), )
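# Hypothetical usage sketch, not part of the source: with the loss-sparsity
# options left at their defaults the final vocab projection comes from
# SparseDenseWithOptions with sparsity disabled, so this behaves like a standard
# decoder-only LM. Assumes the module's PositionalEncoder and DecoderBlock
# helpers are in scope.
import numpy as np
from trax import shapes

configurable_lm = ConfigurableTransformerLM(vocab_size=256, d_model=128,
                                             d_ff=256, n_layers=2, n_heads=4,
                                             max_len=64, mode='eval')
toks = np.ones((1, 8), dtype=np.int32)
configurable_lm.init(shapes.signature(toks))
out = configurable_lm(toks)   # shape (1, 8, 256)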
def _Dropout(): return tl.Dropout(rate=dropout, mode=mode)
def _Dropout(): return tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode)
def model_fn(mode='train'): return tl.Serial( tl.Dropout(mode=mode, rate=0.1), tl.BatchNorm(mode=mode), models.MLP(layer_widths=(16, 16, n_classes), mode=mode))
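# Hypothetical usage sketch, not part of the source: the `mode` argument builds a
# stochastic training model and a deterministic eval model from one definition.
# Assumes `model_fn` is defined at module level so the global `n_classes` it
# reads is visible, and that `tl` and `models` are imported as in trax.
n_classes = 10
train_model = model_fn(mode='train')   # Dropout active, BatchNorm updates stats
eval_model = model_fn(mode='eval')     # same architecture, deterministic pass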