def attention(*args, **kwargs):
  # Number of input positions to remember in a cache when doing fast inference.
  kwargs['predict_mem_len'] = 120
  # Number of input elements to drop once the fast inference input cache fills up.
  kwargs['predict_drop_len'] = 120
  # Return the attention layer with the parameters defined above.
  return tl.SelfAttention(*args, **kwargs)
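# Usage sketch (illustrative, not from the source): a wrapper like `attention`
# is typically passed wherever a Trax model expects an attention-layer
# constructor, e.g. the `attention_type` argument of `trax.models.ReformerLM`.
# The hyperparameters below are hypothetical placeholders.
import trax

model = trax.models.ReformerLM(
    vocab_size=320,
    d_model=256,
    d_ff=512,
    n_layers=2,
    mode='predict',            # fast inference mode, which uses the cache
    attention_type=attention,  # the wrapper defined above
)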
def EncoderDecoderBlock(d_model, d_ff, n_heads, dropout, ff_activation,
                        ff_dropout, mode, ff_use_sru=0, ff_chunk_size=0,
                        ff_sparsity=0):
  """Reversible transformer decoder layer.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    ff_activation: the non-linearity in feed-forward layer
    ff_dropout: float: (optional) separate dropout rate for feed-forward layer
    mode: str: 'train' or 'eval'
    ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity

  Returns:
    the layer.
  """
  enc_dec_attention = tl.EncDecAttention(
      n_heads=n_heads, d_qk=d_model // n_heads, d_v=d_model // n_heads,
      attention_dropout=dropout, output_dropout=dropout, mode=mode)
  enc_dec_attention_half_residual = tl.ReversibleHalfResidual(
      tl.LayerNorm(),
      attention_layer=enc_dec_attention,
  )

  causal_attention = tl.SelfAttention(
      n_heads=n_heads, d_qk=d_model // n_heads, d_v=d_model // n_heads,
      causal=True, attention_dropout=dropout, output_dropout=dropout,
      mode=mode)
  causal_attention_half_residual = tl.ReversibleHalfResidual(
      tl.LayerNorm(),
      attention_layer=causal_attention,
  )

  feed_forward = ct.FeedForwardWithOptions(
      d_model, d_ff, dropout, [-2], ff_activation, ff_dropout,
      ff_chunk_size, ff_use_sru, ff_sparsity, mode)

  return [                             # vec_d1 vec_d2 vec_e masks
      causal_attention_half_residual,
      tl.ReversibleSwap(),
      enc_dec_attention_half_residual,
      tl.ReversibleSwap(),
      tl.ReversibleHalfResidual(feed_forward),
      tl.ReversibleSwap(),
  ]
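# Sketch (assumptions flagged): the returned layer list is meant to be wired
# into a reversible stack. `tl.Dup` and `tl.ReversibleSerial` are standard
# Trax combinators; the hyperparameters and the surrounding plumbing of the
# encoder activations and mask are illustrative, not from the source.
from trax import layers as tl

block = EncoderDecoderBlock(
    d_model=512, d_ff=2048, n_heads=8, dropout=0.1,
    ff_activation=tl.Relu, ff_dropout=0.1, mode='train')
# Duplicate the decoder activations into the two reversible streams
# (vec_d1, vec_d2); the encoder activations and mask stay on the stack.
decoder_stack = tl.Serial(tl.Dup(), tl.ReversibleSerial(block))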
def EncoderDecoderBlock(d_model, d_ff, n_heads, dropout, ff_activation,
                        ff_dropout, mode):
  """Reversible transformer decoder layer.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    ff_activation: the non-linearity in feed-forward layer
    ff_dropout: float: (optional) separate dropout rate for feed-forward layer
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
  enc_dec_attention = tl.EncDecAttention(
      n_heads=n_heads, d_qk=d_model // n_heads, d_v=d_model // n_heads,
      attention_dropout=dropout, output_dropout=dropout, mode=mode)
  enc_dec_attention_half_residual = ReversibleHalfResidualV2(
      tl.LayerNorm(),
      attention_layer=enc_dec_attention,
  )

  causal_attention = tl.SelfAttention(
      n_heads=n_heads, d_qk=d_model // n_heads, d_v=d_model // n_heads,
      causal=True, attention_dropout=dropout, output_dropout=dropout,
      mode=mode)
  causal_attention_half_residual = ReversibleHalfResidualV2(
      tl.LayerNorm(),
      attention_layer=causal_attention,
  )

  feed_forward = FeedForward(
      d_model, d_ff, dropout, ff_activation, ff_dropout, mode)

  return [                             # vec_d1 vec_d2 vec_e masks
      causal_attention_half_residual,
      tl.ReversibleSwap(),
      enc_dec_attention_half_residual,
      tl.ReversibleSwap(),
      ReversibleHalfResidualV2(feed_forward),
      tl.ReversibleSwap(),
  ]
def RecommenderTransformer(n_classes_in, embedding_size, n_out_classes,
                           dropout_rate):
  transformer = tl.Serial(
      tl.Embedding(n_classes_in, d_feature=embedding_size),
      tl.Dropout(dropout_rate),
      tl.SelfAttention(2),
      tl.Flatten(),
      tl.Dropout(dropout_rate),
      # tl.DotProductCausalAttention(4),
      tl.Dense(n_out_classes),
      tl.LogSoftmax())
  print(str(transformer))
  return transformer
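# Minimal sketch of initializing and running the model on dummy token ids,
# assuming standard Trax APIs; shapes and hyperparameters are illustrative.
import numpy as np
import trax

model = RecommenderTransformer(
    n_classes_in=1000, embedding_size=32, n_out_classes=10, dropout_rate=0.1)
tokens = np.zeros((1, 16), dtype=np.int32)  # (batch, sequence_length)
model.init(trax.shapes.signature(tokens))   # initialize weights and state
log_probs = model(tokens)                   # forward pass -> log-probabilities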
def EncoderBlock(d_model, d_ff, n_heads, dropout, ff_activation, ff_dropout,
                 mode):
  """Returns a list of layers that implements a Reformer encoder block.

  The input to the layer is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    ff_activation: the non-linearity in feed-forward layer
    ff_dropout: the dropout rate in feed-forward layer
    mode: str: 'train' or 'eval'

  Returns:
    A list of layers that maps (activations, mask) to (activations, mask).
  """
  if mode == 'predict':
    # Mode 'predict' means that the decoder should be run one token at a time.
    # The encoder only ever runs over full sequences, which is why it's
    # switched to 'eval' mode instead.
    mode = 'eval'

  attention = tl.SelfAttention(
      n_heads=n_heads, d_qk=d_model // n_heads, d_v=d_model // n_heads,
      masked=True,
      attention_dropout=dropout, output_dropout=dropout,
      mode=mode)
  attention_half_residual = ReversibleHalfResidualV2(
      tl.LayerNorm(),
      attention_layer=attention,
  )

  feed_forward = FeedForward(
      d_model, d_ff, dropout, ff_activation, ff_dropout, mode)

  return [
      attention_half_residual,
      tl.ReversibleSwap(),
      ReversibleHalfResidualV2(feed_forward),
      tl.ReversibleSwap(),
  ]
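# Sketch of stacking these encoder blocks into a reversible encoder, assuming
# the `ReversibleHalfResidualV2` / `FeedForward` helpers from the surrounding
# module are available; depth, hyperparameters, and the mask plumbing around
# the stack are illustrative, not from the source.
from trax import layers as tl

blocks = []
for _ in range(6):  # illustrative depth
  blocks.extend(EncoderBlock(
      d_model=512, d_ff=2048, n_heads=8, dropout=0.1,
      ff_activation=tl.Relu, ff_dropout=0.1, mode='train'))
encoder = tl.Serial(
    tl.Dup(),                                    # split into two streams
    tl.ReversibleSerial(blocks),
    tl.Fn('XYAvg', lambda x, y: (x + y) / 2.0),  # merge the streams back
)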
def EncoderBlock(d_model, d_ff, n_heads, dropout, ff_activation, mode):
  """Returns a list of layers that implements a Reformer encoder block.

  The input to the layer is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    ff_activation: the non-linearity in feed-forward layer
    mode: str: 'train' or 'eval'

  Returns:
    A list of layers that maps (activations, mask) to (activations, mask).
  """
  attention = tl.SelfAttention(
      n_heads=n_heads, d_qk=d_model // n_heads, d_v=d_model // n_heads,
      masked=True,
      attention_dropout=0.0,  # TODO(kitaev): attention dropout
      mode=mode)
  attention_half_residual = ReversibleHalfResidualV2(
      tl.LayerNorm(),
      attention_layer=attention,
      # TODO(kitaev): add output dropout to attention layer. rate=dropout
  )

  # TODO(kitaev): Switch to FeedForward with BroadcastedDropout?
  feed_forward = transformer._FeedForwardBlock(  # pylint: disable=protected-access
      d_model, d_ff, dropout, -1, mode, ff_activation)
  # feed_forward = FeedForward(d_model, d_ff, dropout, ff_activation, mode)

  return [
      attention_half_residual,
      tl.ReversibleSwap(),
      ReversibleHalfResidualV2(feed_forward),
      tl.ReversibleSwap(),
  ]
def BERT(d_model=768,
         vocab_size=30522,
         max_len=512,
         type_vocab_size=2,
         n_heads=12,
         d_ff=3072,
         n_layers=12,
         head=None,
         init_checkpoint=None,
         mode='eval'):
  """BERT (default hparams are for bert-base-uncased)."""
  layer_norm_eps = 1e-12
  d_head = d_model // n_heads

  word_embeddings = tl.Embedding(d_model, vocab_size)
  type_embeddings = tl.Embedding(d_model, type_vocab_size)
  position_embeddings = tl.PositionalEncoding(max_len, mode=mode)
  embeddings = [
      tl.Select([0, 1, 0], n_in=3),  # Drops 'idx' input.
      tl.Parallel(
          word_embeddings,
          type_embeddings,
          [tl.PaddingMask(),
           tl.Fn('Squeeze', lambda x: np.squeeze(x, (1, 2)), n_out=1)]
      ),
      tl.Add(),
      position_embeddings,
      tl.LayerNorm(epsilon=layer_norm_eps),
  ]

  encoder = []
  for _ in range(n_layers):
    attn = tl.SelfAttention(n_heads=n_heads, d_qk=d_head, d_v=d_head,
                            bias=True, masked=True, mode=mode)
    feed_forward = [
        tl.Dense(d_ff),
        tl.Gelu(),
        tl.Dense(d_model)
    ]
    encoder += [
        tl.Select([0, 1, 1]),  # Save a copy of the mask
        tl.Residual(attn, AddBias()),  # pylint: disable=no-value-for-parameter
        tl.LayerNorm(epsilon=layer_norm_eps),
        tl.Residual(*feed_forward),
        tl.LayerNorm(epsilon=layer_norm_eps),
    ]
  encoder += [tl.Select([0], n_in=2)]  # Drop the mask

  pooler = [
      tl.Fn('', lambda x: (x[:, 0, :], x), n_out=2),
      tl.Dense(d_model),
      tl.Tanh(),
  ]

  init_checkpoint = init_checkpoint if mode == 'train' else None
  bert = PretrainedBERT(
      embeddings + encoder + pooler, init_checkpoint=init_checkpoint)

  if head is not None:
    bert = tl.Serial(bert, head())

  return bert
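# Hedged sketch of attaching a task head: `head` is called with no arguments,
# so it should be a zero-argument callable returning a Trax layer. The head
# below (name, `n_classes`, structure) is hypothetical, not from the source.
from trax import layers as tl

def classification_head(n_classes=2):  # hypothetical helper
  # The pooler emits (pooled_vector, sequence_activations); keep the pooled
  # vector and classify it.
  return lambda: tl.Serial(
      tl.Select([0], n_in=2),
      tl.Dense(n_classes),
      tl.LogSoftmax(),
  )

model = BERT(head=classification_head(), mode='train')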
def attention(*args, **kwargs):
  kwargs['predict_mem_len'] = 120   # max length for predictions
  kwargs['predict_drop_len'] = 120  # never drop old stuff
  return tl.SelfAttention(*args, **kwargs)
def attention(*args, **kwargs):
  kwargs['predict_mem_len'] = 120
  kwargs['predict_drop_len'] = 120
  return tl.SelfAttention(*args, **kwargs)