def FeedForwardWithOptions(d_model,
                           d_ff,
                           dropout,
                           dropout_shared_axes,
                           ff_activation,
                           ff_dropout,
                           ff_chunk_size,
                           ff_use_sru,
                           ff_sparsity,
                           mode,
                           use_bfloat16=False,
                           ff_sparsity_type='1inN'):
  """Feed-Forward block with all the options.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    ff_activation: Type of activation function at the end of each block; must
      be an activation-type subclass of `Layer`.
    ff_dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout after the FF dense layer.
    ff_chunk_size: int; if > 0, chunk the feed-forward computation into chunks
      of this size.
    ff_use_sru: int or pair of ints; if > 0, use this many SRU layers in
      addition to the feed-forward block (the second int specifies the SRU
      size).
    ff_sparsity: int, tuple or string; if not 0, use a sparse feed-forward
      block with this sparsity.
    mode: If `'train'`, each block will include dropout; else, it will pass
      all values through unaltered.
    use_bfloat16: whether to use bfloat16 for weights (default: False).
    ff_sparsity_type: string; if ff_sparsity > 0, use SparseFF if
      ff_sparsity_type is `'1inN'`, BlockSparseFF if it is `'Block'`, and
      SwitchSparseFF if it is `'Switch'`.

  Returns:
    A list of layers which maps vectors to vectors.
  """
  if ff_sparsity and ff_sparsity_type == '1inN':
    temperature, quant_prob = 0.1, 0.3
    if isinstance(ff_sparsity, str):
      # This is hacky but used to pass ff_sparsity in yaml sweep files.
      ff_sparsity = [(float(x) if '.' in x else int(x))
                     for x in ff_sparsity.split()]
    if isinstance(ff_sparsity, (list, tuple)):
      if len(ff_sparsity) == 2:
        n_elements_in_block, d_lowrank = ff_sparsity
      else:
        n_elements_in_block, d_lowrank, temperature, quant_prob = ff_sparsity
    else:
      assert isinstance(ff_sparsity, int)
      n_elements_in_block, d_lowrank = ff_sparsity, d_ff // ff_sparsity
    ff = tl.SparseFF(
        d_ff,
        n_elements_in_block=n_elements_in_block,
        d_lowrank=d_lowrank,
        temperature=temperature,
        quant_prob=quant_prob,
        use_bfloat16=use_bfloat16,
        mode=mode,
        dropout_rate=dropout,
        dropout_shared_axes=dropout_shared_axes,
        ff_chunk_size=ff_chunk_size)
  elif ff_sparsity and ff_sparsity_type == 'Block':
    ff = tl.BlockSparseFF(d_ff, num_experts=ff_sparsity, mode=mode)
  elif ff_sparsity and ff_sparsity_type == 'Switch':
    ff = tl.SwitchSparseFF(d_ff, num_experts=ff_sparsity, mode=mode)
  else:
    ff = _FeedForward(d_model, d_ff, dropout, ff_activation, ff_dropout,
                      use_bfloat16, mode)
  res = [tl.LayerNorm(), ff]
  if ff_sparsity_type != '1inN' or ff_sparsity == 0:
    # SparseFF has Dropout and BatchLeadingAxes built in, so only the
    # non-SparseFF paths need explicit dropout and chunking here.
    res.append(
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode))
    if ff_chunk_size > 0:
      res = tl.BatchLeadingAxes(tl.Chunk(tl.Serial(res), ff_chunk_size))
  if ff_use_sru:
    if isinstance(ff_use_sru, (list, tuple)):
      sru_n_layers, sru_n_units = ff_use_sru
    else:
      sru_n_layers, sru_n_units = ff_use_sru, 32
    sru = [tl.SRU(sru_n_units, mode=mode) for _ in range(sru_n_layers)]
    block = [tl.LayerNorm(), tl.Dense(sru_n_units)] + sru + [tl.Dense(d_model)]
    res = tl.Residual(block, shortcut=res)
  return [res]
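# A minimal usage sketch, not part of the library itself: the hyperparameter
# values below are illustrative assumptions, and it presumes `tl` is
# `trax.layers` as imported at the top of this module. It builds the plain
# dense variant of the block (ff_sparsity=0, ff_use_sru=0) and runs a forward
# pass on a zero batch to show that the block maps (batch, length, d_model)
# activations back to the same shape.
if __name__ == '__main__':
  import numpy as np
  from trax import shapes

  ffn = tl.Serial(FeedForwardWithOptions(
      d_model=512,
      d_ff=2048,
      dropout=0.1,
      dropout_shared_axes=(0, 1),
      ff_activation=tl.Relu,
      ff_dropout=0.1,
      ff_chunk_size=0,
      ff_use_sru=0,
      ff_sparsity=0,
      mode='eval'))
  x = np.zeros((2, 16, 512), dtype=np.float32)  # (batch, length, d_model)
  ffn.init(shapes.signature(x))
  print(ffn(x).shape)  # Expected: (2, 16, 512)
  # For a sparse variant one would instead pass, e.g., ff_sparsity=32
  # (1-in-32 blocks) or the string '32 64 0.1 0.3' (block size, low-rank
  # dimension, temperature, quantization probability), which the parsing
  # above converts into SparseFF arguments.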