def SkippingTransformerLM(vocab_size,
                          d_model=512,
                          d_ff=2048,
                          n_layers=6,
                          n_heads=8,
                          d_attention_key=None,
                          d_attention_value=None,
                          attention_type=tl.DotProductCausalAttention,
                          dropout=0.1,
                          share_qk=False,
                          max_len=2048,
                          mode='train',
                          ff_activation=tl.Relu):
  """Returns a Skipping Transformer language model.

  The input to the model is a tensor of tokens. (This model uses only the
  decoder part of the overall Transformer.)

  Args:
    vocab_size: int: vocab size
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    d_attention_key: int: depth of key vector for each attention head
      (default is d_model // n_heads)
    d_attention_value: int: depth of value vector for each attention head
      (default is d_model // n_heads)
    attention_type: subclass of tl.BaseCausalAttention: attention class to use
    dropout: float: dropout rate (how much to drop out)
    share_qk: bool, whether to share queries and keys in decoder attention
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference
    ff_activation: the non-linearity in feed-forward layer

  Returns:
    A Transformer language model as a layer that maps from a tensor of tokens
    to activations over a vocab set.
  """
  embedder = [
      tl.Embedding(d_model, vocab_size),
      tl.Dropout(rate=dropout, name='embedding', mode=mode),
      tl.PositionalEncoding(max_len=max_len, mode=mode),
  ]
  return tl.Serial(
      tl.ShiftRight(mode=mode),
      embedder,
      SkippingSerial(
          [transformer._DecoderBlock(  # pylint: disable=g-complex-comprehension,protected-access
              d_model, d_ff, n_heads, d_attention_key, d_attention_value,
              attention_type, dropout, share_qk, i, mode, ff_activation)
           for i in range(n_layers)],
          mode=mode),
      tl.LayerNorm(),
      tl.Dense(vocab_size),
      tl.LogSoftmax(),
  )
def SkippingTransformerLM(vocab_size,
                          d_model=512,
                          d_ff=2048,
                          n_layers=6,
                          n_heads=8,
                          dropout=0.1,
                          max_len=2048,
                          mode='train',
                          ff_activation=tl.Relu):
  """Returns a Skipping Transformer language model.

  The input to the model is a tensor of tokens. (This model uses only the
  decoder part of the overall Transformer.)

  Args:
    vocab_size: int: vocab size
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference
    ff_activation: the non-linearity in feed-forward layer

  Returns:
    A Transformer language model as a layer that maps from a tensor of tokens
    to activations over a vocab set.
  """
  embedder = [
      tl.Embedding(d_model, vocab_size),
      tl.Dropout(rate=dropout, name='embedding', mode=mode),
      tl.PositionalEncoding(max_len=max_len, mode=mode),
  ]
  return tl.Serial(
      tl.ShiftRight(mode=mode),
      embedder,
      SkippingSerial(
          [transformer._DecoderBlock(  # pylint: disable=g-complex-comprehension,protected-access
              d_model, d_ff, n_heads, dropout, i, mode, ff_activation)
           for i in range(n_layers)],
          mode=mode),
      tl.LayerNorm(),
      tl.Dense(vocab_size),
      tl.LogSoftmax(),
  )
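# --- Usage sketch (not part of the original source) ---
# A minimal forward pass through SkippingTransformerLM. The hyperparameters and
# token shapes below are illustrative assumptions, not library defaults;
# `shapes.signature` and `.init` are the standard trax initialization idiom.
import numpy as np
from trax import shapes

model = SkippingTransformerLM(vocab_size=128, d_model=64, d_ff=128,
                              n_layers=2, n_heads=2, mode='train')
tokens = np.ones((1, 16), dtype=np.int32)   # (batch, sequence_length)
model.init(shapes.signature(tokens))
log_probs = model(tokens)                   # (batch, sequence_length, vocab_size)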
def ConditionedBlock(current_layer_num):
  return tl.Serial(
      # stack: embedding, n_layers_to_keep
      tl.Select([1, 0, 1]),  # n_layers_to_keep, embedding, n_layers_to_keep
      tl.Cond(
          # if n_layers_to_keep > current_layer_num
          LargerThan(float(current_layer_num)),
          # then: run block
          tl.Serial(transformer._DecoderBlock(  # pylint: disable=g-complex-comprehension,protected-access
              d_model, d_ff, n_heads, dropout, [], mode, ff_activation)),
          # else: run noop
          tl.Serial()
      )
      # stack: embedding, n_layers_to_keep
  )
def ConditionedBlock(current_layer_num):
  return tl.Serial(
      # stack: embedding, n_layers_to_keep
      tl.Select([1, 0, 1]),  # n_layers_to_keep, embedding, n_layers_to_keep
      tl.Cond(
          # if random() > skip_fraction OR layer not in skip_mode ...
          LargerThan(skip_fraction if skip_mode_fun(current_layer_num)
                     else 0.0),
          # then: run block
          tl.Serial(transformer._DecoderBlock(  # pylint: disable=g-complex-comprehension,protected-access
              d_model, d_ff, n_heads, dropout, [], mode, ff_activation))
          # else: noop (implicit)
      )
      # stack: embedding, n_layers_to_keep
  )
def ConditionedBlock(current_layer_num):
  return tl.Serial(
      # stack: embedding
      tl.RandomUniform(0., 1, sync=True),
      # stack: random_uniform, embedding
      tl.Cond(
          # if random_uniform > skip_fraction
          LargerThan(skip_fraction[current_layer_num] if mode == 'train'
                     else 0.0),
          # then: run block
          tl.Serial(transformer._DecoderBlock(  # pylint: disable=g-complex-comprehension,protected-access
              d_model, d_ff, n_heads, dropout, [], mode, ff_activation)),
          # else: run noop
          tl.Serial()
      )
      # stack: embedding
  )
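# --- Illustration (not part of the original source) ---
# A self-contained sketch of the Cond/LargerThan skipping pattern used by the
# ConditionedBlock variants above: push a scalar on the stack, let `tl.Cond`
# consume it, and either run a block or a no-op Serial on the remaining input.
# `LargerThan` is re-declared here as an assumed helper mirroring the one used
# above; the threshold, Dense layer and input shapes are illustrative only.
import numpy as np
from trax import layers as tl
from trax import shapes


def LargerThan(val):
  """Checks whether the top of the stack is larger than `val`."""
  return tl.Fn('LargerThan', lambda x: x > val)


skip_or_run = tl.Serial(
    tl.RandomUniform(0., 1., sync=True),   # stack: rand, x
    tl.Cond(
        LargerThan(0.5),                   # consumes rand
        tl.Serial(tl.Dense(4)),            # then: run a (toy) block on x
        tl.Serial()),                      # else: pass x through unchanged
)
x = np.ones((2, 4), dtype=np.float32)
skip_or_run.init(shapes.signature(x))
y = skip_or_run(x)                         # Dense(4)(x) or x, depending on the draw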
def TransformerNoEncDecAttention(input_vocab_size,
                                 output_vocab_size=None,
                                 d_model=512,
                                 d_ff=2048,
                                 n_encoder_layers=6,
                                 n_decoder_layers=6,
                                 n_heads=8,
                                 dropout=0.1,
                                 dropout_shared_axes=None,
                                 max_len=2048,
                                 mode='train',
                                 ff_activation=tl.Relu):
  """Returns a Transformer model.

  This model expects an input pair: source, target.

  Args:
    input_vocab_size: int: vocab size of the source.
    output_vocab_size: int (optional): vocab size of the target. If None, the
      source and target are assumed to have the same vocab.
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_encoder_layers: int: number of encoder layers
    n_decoder_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    dropout_shared_axes: axes on which to share dropout mask
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train' or 'eval'
    ff_activation: the non-linearity in feed-forward layer

  Returns:
    A Transformer model as a layer that maps from a source, target pair to
    activations over a vocab set.
  """
  def PositionalEncoder(vocab_size):  # tokens --> vectors
    return [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        tl.PositionalEncoding(max_len=max_len),
    ]

  in_encoder = PositionalEncoder(input_vocab_size)
  out_encoder = (in_encoder if output_vocab_size is None
                 else PositionalEncoder(output_vocab_size))
  if output_vocab_size is None:
    output_vocab_size = input_vocab_size

  encoder_blocks = [
      transformer._EncoderBlock(d_model, d_ff, n_heads, dropout,  # pylint: disable=protected-access
                                dropout_shared_axes, mode, ff_activation)
      for i in range(n_encoder_layers)]

  encoder = tl.Serial(
      in_encoder,
      encoder_blocks,
      tl.LayerNorm()
  )
  if mode == 'predict':
    encoder = tl.Cache(encoder)

  decoder_blocks = [
      transformer._DecoderBlock(d_model, d_ff, n_heads, dropout,  # pylint: disable=protected-access
                                dropout_shared_axes, mode, ff_activation)
      for i in range(n_decoder_layers)]

  # Assemble and return the model.
  return tl.Serial(
      # Input: encoder_side_tokens, decoder_side_tokens
      # Copy decoder tokens for use in loss.
      tl.Select([0, 0, 1, 1]),               # tok_e tok_e tok_d tok_d

      # Encode.
      tl.Branch([], tl.PaddingMask()),       # tok_e mask_e tok_e tok_d tok_d
      encoder,                               # vec_e mask_e tok_e tok_d tok_d

      # Simple encoder mask, doesn't contain extra dims.
      tl.Select([2, 0, 2], n_in=3),          # tok_e vec_e tok_e tok_d tok_d
      transformer._MaskOfRightShiftedArray(  # pylint: disable=protected-access
          n_positions=0),                    # mask_e vec_e tok_e tok_d tok_d

      # Decode.
      tl.Select([3, 1, 0, 2]),               # tok_d vec_e mask_e tok_e tok_d
      tl.ShiftRight(mode=mode),              # stok_d vec_e mask_e tok_e tok_d
      tl.Branch(
          [],
          transformer._MaskOfRightShiftedArray()  # pylint: disable=protected-access
      ),                                     # stok_d mask_d vec_e mask_e tok_e tok_d
      out_encoder,                           # svec_d mask_d vec_e mask_e tok_e tok_d

      # Concat encoder and decoder.
      tl.Select([2, 0, 3, 1]),               # vec_e svec_d mask_e mask_d tok_e tok_d
      transformer._ConcatWithPadding(),      # vec_ed tok_e tok_d  pylint: disable=protected-access

      # Decoder blocks with causal attention
      decoder_blocks,                        # vec_ed tok_e tok_d
      tl.LayerNorm(),                        # vec_ed tok_e tok_d

      # Separate out the encoder part from the concatenated vector.
      tl.Select([0, 1, 2, 2]),               # vec_ed tok_e tok_d tok_d
      transformer._StripFromConcatenateWithPadding(),  # vec_d tok_d  pylint: disable=protected-access

      # Map to output vocab.
      tl.Dense(output_vocab_size),           # vec_d tok_d
      tl.LogSoftmax(),                       # vec_d tok_d
  )
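# --- Illustration (not part of the original source) ---
# A toy numpy sketch of the "concat encoder and decoder" step used above:
# encoder activations are placed in front of the shifted decoder activations
# along the time axis, so a single stack of causal decoder blocks can attend
# over both without explicit encoder-decoder attention. Per-example padding
# handling, which _ConcatWithPadding performs, is ignored here; all shapes are
# illustrative.
import numpy as np

vec_e = np.arange(6, dtype=np.float32).reshape(1, 3, 2)    # (batch, len_enc, d_model)
svec_d = -np.arange(4, dtype=np.float32).reshape(1, 2, 2)  # (batch, len_dec, d_model)
vec_ed = np.concatenate([vec_e, svec_d], axis=1)           # (batch, len_enc + len_dec, d_model)
assert vec_ed.shape == (1, 5, 2)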
def Transformer2(input_vocab_size,
                 output_vocab_size=None,
                 d_model=512,
                 d_ff=2048,
                 n_encoder_layers=6,
                 n_decoder_layers=6,
                 n_heads=8,
                 dropout=0.1,
                 dropout_shared_axes=None,
                 max_len=2048,
                 mode='train',
                 ff_activation=tl.Relu,
                 axial_pos_shape=None,
                 d_axial_pos_embs=None):
  """Returns a Transformer model.

  This model expects an input pair: source, target.

  Args:
    input_vocab_size: int: vocab size of the source.
    output_vocab_size: int (optional): vocab size of the target. If None, the
      source and target are assumed to have the same vocab.
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_encoder_layers: int: number of encoder layers
    n_decoder_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    dropout_shared_axes: axes on which to share dropout mask
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train' or 'eval'
    ff_activation: the non-linearity in feed-forward layer
    axial_pos_shape: tuple of ints: input shape to use for the axial position
      encoding. If unset, axial position encoding is disabled.
    d_axial_pos_embs: tuple of ints: depth of position embedding for each axis.
      Tuple length must match axial_pos_shape, and values must sum to d_model.

  Returns:
    A Transformer model as a layer that maps from a source, target pair to
    activations over a vocab set.
  """
  in_encoder, out_encoder, output_vocab_size = (
      ct.EmbeddingAndPositionalEncodings(
          input_vocab_size,
          d_model,
          mode,
          dropout,
          dropout_shared_axes,
          max_len,
          output_vocab_size=output_vocab_size,
          axial_pos_shape=axial_pos_shape,
          d_axial_pos_embs=d_axial_pos_embs)
  )

  encoder_blocks = [
      transformer._EncoderBlock(d_model, d_ff, n_heads, dropout,  # pylint: disable=protected-access
                                dropout_shared_axes, mode, ff_activation)
      for i in range(n_encoder_layers)]

  encoder = tl.Serial(
      in_encoder,
      encoder_blocks,
      tl.LayerNorm()
  )
  if mode == 'predict':
    encoder = tl.Cache(encoder)

  decoder_blocks = [
      transformer._DecoderBlock(d_model, d_ff, n_heads, dropout,  # pylint: disable=protected-access
                                dropout_shared_axes, mode, ff_activation)
      for i in range(n_decoder_layers)]

  # Assemble and return the model.
  return tl.Serial(
      # Input: encoder_side_tokens, decoder_side_tokens
      # Copy decoder tokens for use in loss.
      tl.Select([0, 0, 1, 1]),              # tok_e tok_e tok_d tok_d

      # Encode.
      tl.Branch([], tl.PaddingMask()),      # tok_e mask_e tok_e tok_d tok_d
      encoder,                              # vec_e mask_e tok_e tok_d tok_d

      # Simple encoder mask, doesn't contain extra dims.
      tl.Select([2, 0, 2], n_in=3),         # tok_e vec_e tok_e tok_d tok_d
      tl.Fn('EncoderMask',                  # mask_e vec_e tok_e tok_d tok_d
            lambda x: x != 0, n_out=1),

      # Decode.
      tl.Select([3, 1, 0, 2]),              # tok_d vec_e mask_e tok_e tok_d
      tl.ShiftRight(mode=mode),             # stok_d vec_e mask_e tok_e tok_d
      out_encoder,                          # svec_d vec_e mask_e tok_e tok_d

      # Concat encoder and decoder.
      tl.Select([1, 0]),                    # vec_e svec_d mask_e tok_e tok_d
      ConcatWithPadding(mode=mode),         # vec_ed tok_e tok_d

      # Decoder blocks with causal attention
      decoder_blocks,                       # vec_ed tok_e tok_d
      tl.LayerNorm(),                       # vec_ed tok_e tok_d

      # Separate out the encoder part from the concatenated vector.
      tl.Select([0, 1, 2, 2]),              # vec_ed tok_e tok_d tok_d
      StripFromConcatenateWithPadding(mode=mode),  # vec_d tok_d

      # Map to output vocab.
      tl.Dense(output_vocab_size),          # vec_d tok_d
      tl.LogSoftmax(),                      # vec_d tok_d
  )
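# --- Usage sketch (not part of the original source) ---
# A minimal forward pass through Transformer2. The model takes a
# (source_tokens, target_tokens) pair and returns log-probabilities over the
# target vocab together with the copied target tokens (kept for the loss).
# All sizes below are illustrative assumptions, not library defaults.
import numpy as np
from trax import shapes

model = Transformer2(input_vocab_size=256, d_model=64, d_ff=128,
                     n_encoder_layers=2, n_decoder_layers=2, n_heads=2,
                     mode='eval')
source = np.ones((1, 10), dtype=np.int32)    # encoder-side tokens
target = np.ones((1, 8), dtype=np.int32)     # decoder-side tokens
model.init(shapes.signature((source, target)))
log_probs, target_copy = model((source, target))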