def MultiplicativeSparseDense(sparsity, d_input, d_output=None, use_bias=True,
                              use_bfloat16=False):
  """Returns a replacement of Dense layer which uses fewer parameters.

  The layer uses a number of modules equal to `sparsity`. It multiplies each
  dimension of the input tensor by a scalar specific to each dimension and
  each module separately; then it applies Dense(d_output/sparsity) to each
  module. Compared to a standard dense layer, MultiplicativeSparseDense uses
  fewer parameters while still being able to express many interesting
  functions (for example, a permutation).

  Args:
    sparsity: The sparsity of the layer; the output vector is divided into
      this number of modules.
    d_input: Dimensionality of the input tensor.
    d_output: Dimensionality of the output tensor; by default equal to
      d_input.
    use_bias: Whether to use bias.
    use_bfloat16: Whether to use bfloat16 for weights.
  """
  d_output = d_output or d_input
  assert d_output % sparsity == 0
  d_module = d_output // sparsity

  layers = [
      # Weight below is used for per-head preprocessing of an embedding.
      tl.Weights(init.RandomNormalInitializer(stddev=0.5),
                 shape=[sparsity, d_input], use_bfloat16=use_bfloat16),
      # Weight below is the dense kernel, shared across heads.
      tl.Weights(init.GlorotUniformInitializer(), [d_input, d_module],
                 use_bfloat16=use_bfloat16),
      # To save memory, the per-head preprocessing and multiplying by the
      # kernel are done in the same einsum.
      tl.Fn('AttentionEinsum',
            (lambda kernel, multiplier, embeds:  # pylint: disable=g-long-lambda
             jnp.einsum('dx,hd,...d->...hx', kernel, multiplier, embeds))),
      MergeLastTwoAxes(),
  ]
  if use_bias:
    layers.extend([
        # Weight below is bias after dense, per-head.
        tl.Weights(init.RandomNormalInitializer(1e-6), [d_output],
                   use_bfloat16=use_bfloat16),
        tl.Add(),
    ])
  return tl.Serial(layers)
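# --- Usage sketch (added illustration, not from the original source): assumes
# MultiplicativeSparseDense above and its module imports are in scope, plus
# numpy and trax.shapes as below.
import numpy as np
from trax import shapes

layer = MultiplicativeSparseDense(sparsity=4, d_input=16)  # d_output -> 16
x = np.ones((2, 16), dtype=np.float32)
layer.init(shapes.signature(x))
y = layer(x)
print(y.shape)  # (2, 16)
# Parameter count: 4*16 (multipliers) + 16*4 (shared kernel) + 16 (bias) = 144,
# versus 16*16 + 16 = 272 for a plain Dense layer of the same shape.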
def __init__(self,
             n_units,
             kernel_initializer=init.GlorotUniformInitializer(),
             bias_initializer=init.RandomNormalInitializer(1e-6),
             use_bias=True,
             use_bfloat16=False):
  """Returns a dense (fully connected) layer of width `n_units`.

  A dense layer maps collections of `R^m` vectors to `R^n`, where `n`
  (`= n_units`) is fixed at layer creation time, and `m` is set at layer
  initialization time.

  Args:
    n_units: Number of nodes in the layer, also known as the width of the
        layer.
    kernel_initializer: Function that creates a matrix of (random) initial
        connection weights `W` for the layer.
    bias_initializer: Function that creates a vector of (random) initial
        bias weights `b` for the layer.
    use_bias: If `True`, compute an affine map `y = Wx + b`; else compute
        a linear map `y = Wx`.
    use_bfloat16: If `True`, use bfloat16 weights instead of the default
        float32; this can save memory but may (rarely) lead to numerical
        issues.
  """
  super().__init__(name=f'Dense_{n_units}')
  self._n_units = n_units
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
  self._use_bias = use_bias
  self._use_bfloat16 = use_bfloat16
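# --- Usage sketch (added illustration): standard trax layer usage; assumes
# `trax` is installed.
import numpy as np
from trax import layers as tl
from trax import shapes

layer = tl.Dense(64)                    # n = n_units = 64
x = np.ones((2, 16), dtype=np.float32)  # m = 16, set at initialization time
layer.init(shapes.signature(x))         # creates W: (16, 64) and b: (64,)
y = layer(x)
print(y.shape)  # (2, 64)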
def Conv1d(filters, kernel_size, stride=1, padding='VALID',
           kernel_initializer=None,
           bias_initializer=init.RandomNormalInitializer(1e-6)):
  """Returns a 1-dimensional convolution layer (wraps `Conv` with 'NWC' layout)."""
  return Conv(filters, (kernel_size,), strides=(stride,), padding=padding,
              dimension_numbers=('NWC', 'WIO', 'NWC'),
              kernel_initializer=kernel_initializer,
              bias_initializer=bias_initializer)
def __init__(self, filters, kernel_size,
             kernel_initializer=init.GlorotUniformInitializer(),
             bias_initializer=init.RandomNormalInitializer(1e-6),
             use_bias=True, padding='VALID'):
  """Returns a locally-connected conv-like layer.

  Args:
    filters: Number of output filters in the convolution.
    kernel_size: The length of the convolution window. Must be an odd number.
    kernel_initializer: Function that creates a matrix of (random) initial
        connection weights `W` for the layer.
    bias_initializer: Function that creates a vector of (random) initial
        bias weights `b` for the layer.
    use_bias: If `True`, the layer uses a bias vector.
    padding: The type of padding to use; must be 'VALID', 'SAME', or 'WRAP'.
  """
  super().__init__(name=f'LocallyConnected1d_{filters}_{kernel_size}')
  self._filters = filters
  self._kernel_size = kernel_size
  assert self._kernel_size % 2 == 1  # kernel size has to be odd
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
  self._use_bias = use_bias
  self._padding = padding
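# --- Usage sketch (added illustration; the (batch, length, depth) input layout
# is an assumption, based on how this layer is used by LocallyConnectedDense
# further below).
import numpy as np
from trax import layers as tl
from trax import shapes

layer = tl.LocallyConnected1d(8, 3, padding='WRAP')  # kernel size must be odd
x = np.ones((2, 10, 4), dtype=np.float32)  # (batch, length, depth)
layer.init(shapes.signature(x))
y = layer(x)
print(y.shape)  # (2, 10, 8); unlike a plain convolution, each of the 10
# positions gets its own kernel rather than sharing one.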
def __init__(self,
             d_ff,
             n_elements_in_block=32,
             d_lowrank=64,
             temperature=0.1,
             quant_prob=0.3,
             use_bfloat16=False,
             big_weights_in_bfloat16=True,
             mode='train',
             kernel_initializer=init.GlorotUniformInitializer(),
             bias_initializer=init.RandomNormalInitializer(1e-6)):
  """Returns a sparse feed-forward block."""
  super().__init__(name=f'SparseFF_{d_ff}')
  self._mode = mode
  self._use_bfloat16 = use_bfloat16
  self._big_weights_in_bfloat16 = big_weights_in_bfloat16
  self._d_ff = d_ff
  self._d_lowrank = d_lowrank
  # Q: what temperature is actually most useful in training?
  self._temperature = temperature if mode == 'train' else 0.0
  self._quant_prob = quant_prob
  self._n_elements_in_block = n_elements_in_block
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
  # Helper numbers, as d_ff will be divided by n_elements_in_block.
  assert self._d_ff % self._n_elements_in_block == 0
  self._d1 = self._d_ff // self._n_elements_in_block
  self._d2 = self._n_elements_in_block
def EinsumDense(d_input, d_output, use_bias):
  """Returns a reimplementation of the Dense layer, using einsum.

  While this layer is equivalent to Dense, it seems to be faster in decoding
  when a bias is used (see decoding_timing_test.py). This layer can be
  removed once we better understand the reason for the difference in
  decoding speed.

  Args:
    d_input: Dimensionality of the input tensor.
    d_output: Dimensionality of the output tensor.
    use_bias: Whether to use bias.
  """
  layers = [
      tl.Weights(init.GlorotUniformInitializer(), [d_output, d_input]),
      tl.Fn('EinsumDense',
            (lambda kernel, embeds:  # pylint: disable=g-long-lambda
             jnp.einsum('xd,...d->...x', kernel, embeds)))
  ]
  if use_bias:
    layers.extend([
        tl.Weights(init.RandomNormalInitializer(1e-6), [d_output]),
        tl.Add()
    ])
  return tl.Serial(layers)
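# --- Usage sketch (added illustration): assumes EinsumDense above and its
# module imports are in scope.
import numpy as np
from trax import shapes

layer = EinsumDense(d_input=16, d_output=64, use_bias=True)
x = np.ones((2, 16), dtype=np.float32)
layer.init(shapes.signature(x))
y = layer(x)
print(y.shape)  # (2, 64); same map as Dense(64): y = x @ W^T + b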
def __init__(self, d_feature, vocab_size,
             kernel_initializer=init.RandomNormalInitializer(1.0)):
  """Returns an embedding layer with given vocabulary size and vector size.

  The layer clips input values (token ids) to the range `[0, vocab_size)`.
  That is, negative token ids all clip to `0` before being mapped to a
  vector, and token ids with value `vocab_size` or greater all clip to
  `vocab_size - 1` before being mapped to a vector. In effect, both id `0`
  and id `vocab_size - 1` are potentially overloaded as out-of-vocabulary
  token ids.

  TODO(jonni): Is this the behavior we want going forward?

  Args:
    d_feature: Dimensionality/depth of the output vectors.
    vocab_size: Size of the input vocabulary. The layer will assign a
        unique vector to each id in `range(vocab_size)`.
    kernel_initializer: Function that creates (random) initial vectors for
        the embedding.
  """
  super().__init__()
  self._d_feature = d_feature  # feature dimensionality
  self._vocab_size = vocab_size
  self._kernel_initializer = kernel_initializer
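# --- Sketch of the clipping behavior described above (added illustration;
# assumes the Embedding class above is in scope; note this version takes
# (d_feature, vocab_size), in that order).
import numpy as np
from trax import shapes

layer = Embedding(8, 10)  # d_feature=8, vocab_size=10
ids = np.array([[-3, 0, 9, 12]], dtype=np.int32)
layer.init(shapes.signature(ids))
vecs = layer(ids)  # shape (1, 4, 8)
# vecs[0, 0] == vecs[0, 1]: id -3 clips to 0.
# vecs[0, 2] == vecs[0, 3]: id 12 clips to 9, i.e. vocab_size - 1.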
def __init__(self, n_units,
             kernel_initializer=init.GlorotUniformInitializer(),
             bias_initializer=init.RandomNormalInitializer(1e-6)):
  super(Dense, self).__init__()
  self._n_units = n_units
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
def __init__(self, d_feature, vocab_size,
             kernel_initializer=init.RandomNormalInitializer(1.0)):
  super(Embedding, self).__init__()
  self._d_feature = d_feature  # feature dimensionality
  self._vocab_size = vocab_size
  self._kernel_initializer = kernel_initializer
def __init__(self, base=16, n_digits=2, mode='train',
             initializer=init.RandomNormalInitializer(1e-6)):
  super(FixedBasePositionalEncoding, self).__init__()
  self._base = base
  self._n_digits = n_digits
  self._mode = mode
  self._initializer = initializer
def __init__(self,
             n_units,
             forget_bias=0.0,
             kernel_initializer=initializers.RandomUniformInitializer(0.01),
             bias_initializer=initializers.RandomNormalInitializer(1e-6)):
  super().__init__(n_in=2, n_out=2)
  self._n_units = n_units
  self._forget_bias = forget_bias
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
def __init__(self,
             n_units,
             forget_bias=1.0,
             kernel_initializer=initializers.GlorotUniformInitializer(),
             bias_initializer=initializers.RandomNormalInitializer(1e-6)):
  super(LSTMCell, self).__init__(n_in=2, n_out=2)
  self._n_units = n_units
  self._forget_bias = forget_bias
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
def _get_rel_att_inputs(d_model, n_heads):  # pylint: disable=invalid-name
  """Creates global relative attention bias weights, shared across layers."""
  assert d_model % n_heads == 0 and d_model % 2 == 0
  d_head = d_model // n_heads

  bias_initializer = init.RandomNormalInitializer(1e-6)
  context_bias_layer = core.Weights(
      bias_initializer, shape=(1, n_heads, 1, d_head))
  location_bias_layer = core.Weights(
      bias_initializer, shape=(1, n_heads, 1, d_head))
  return context_bias_layer, location_bias_layer
def test_custom_initializer_shape(self):
  layer = tl.Weights(
      lambda shape, rng: jnp.zeros(shape, dtype=jnp.float32), (2, 2))
  layer.init(())
  y = layer(())
  self.assertEqual(y.tolist(), [[0., 0.], [0., 0.]])

  layer = tl.Weights(init.RandomNormalInitializer(), (2, 2))
  layer.init(())
  y = layer(())
  self.assertEqual(y.shape, (2, 2))
  self.assertNotEqual(y.tolist(), [[0., 0.], [0., 0.]])
def __init__(self, filters, kernel_width=3, kernel_initializer=None,
             bias_initializer=init.RandomNormalInitializer(1e-6)):
  super(CausalConv, self).__init__(
      filters=filters,
      kernel_size=(kernel_width,),
      strides=None,
      padding='VALID',
      dimension_numbers=('NWC', 'WIO', 'NWC'),
      kernel_initializer=kernel_initializer,
      bias_initializer=bias_initializer)
def MultiplicativeModularSparseDense(sparsity, d_feature):
  """Returns a replacement of Dense layer which uses fewer parameters.

  The layer uses a number of modules equal to `sparsity`. It is a combination
  of multiplicative dense and locally-connected dense layers.

  Args:
    sparsity: The sparsity of the layer; the output vector is divided into
      this number of modules.
    d_feature: Dimensionality of the input and output tensor.
  """
  assert d_feature % sparsity == 0
  d_module = d_feature // sparsity

  return tl.Serial(
      # Weight below is used for per-head preprocessing of an embedding.
      tl.Weights(init.RandomNormalInitializer(stddev=0.5),
                 shape=[sparsity, d_feature]),
      # Weight below is a kernel of multiplicative dense, shared across heads.
      tl.Weights(init.GlorotUniformInitializer(), [d_feature, d_module]),
      # Weight below is a kernel of modular dense.
      tl.Weights(
          functools.partial(init.GlorotUniformInitializer(),
                            nonreceptive_dims=[0]),
          [sparsity, d_module, d_module]),
      # To save memory, the per-head preprocessing and multiplying by kernels
      # are done in a single einsum.
      tl.Fn('SparseDenseEinsum',
            (lambda kmod, kmult, multiplier, embeds:  # pylint: disable=g-long-lambda
             jnp.einsum('hxo,dx,hd,...d->...ho',
                        kmod, kmult, multiplier, embeds))),
      MergeLastTwoAxes(),
      # Weight below is bias after dense, per-head.
      tl.Weights(init.RandomNormalInitializer(1e-6), [d_feature]),
      tl.Add(),
  )
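# --- Usage sketch (added illustration): assumes MultiplicativeModularSparseDense
# above and its module imports are in scope.
import numpy as np
from trax import shapes

layer = MultiplicativeModularSparseDense(sparsity=4, d_feature=16)
x = np.ones((2, 16), dtype=np.float32)
layer.init(shapes.signature(x))
y = layer(x)
print(y.shape)  # (2, 16): 4 heads of width d_module=4, merged back to 16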
def __init__(self, shape=(64, 64, 3), d_embs=(384, 384, 256),
             kernel_initializer=init.RandomNormalInitializer(1.0),
             dropout=0.0, dropout_broadcast_dims=(), mode='train'):
  super().__init__()
  self._kernel_initializer = kernel_initializer
  assert len(shape) == len(d_embs)
  self._shape = shape
  self._d_embs = d_embs

  if dropout >= 1.0:
    raise ValueError('Dropout rates must be lower than 1.')
  if mode == 'train':
    self._dropout = dropout
  else:
    self._dropout = 0.0
  self._dropout_broadcast_dims = dropout_broadcast_dims
  self._mode = mode
def __init__(self,
             d_ff,
             num_experts=64,
             temperature=0.7,
             mode='train',
             kernel_initializer=init.GlorotUniformInitializer(),
             bias_initializer=init.RandomNormalInitializer(1e-6)):
  """Returns a block sparse feed-forward block."""
  super().__init__(name=f'BlockSparseFF_{d_ff}')
  self._mode = mode
  self._d_ff = d_ff
  self._num_experts = num_experts
  self._temperature = temperature if mode == 'train' else 0.0
  assert self._d_ff % self._num_experts == 0
  self._n_elements_in_block = d_ff // num_experts
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
def __init__(self, filters, kernel_size, strides=None, padding='VALID',
             dimension_numbers=('NHWC', 'HWIO', 'NHWC'),
             kernel_initializer=None,
             bias_initializer=init.RandomNormalInitializer(1e-6)):
  super().__init__()
  self._filters = filters
  self._kernel_size = kernel_size
  self._padding = padding
  self._dimension_numbers = dimension_numbers
  self._lhs_spec, self._rhs_spec, self._out_spec = dimension_numbers
  self._one = (1,) * len(kernel_size)
  self._strides = strides or self._one
  self._bias_initializer = bias_initializer
  rhs_spec = self._rhs_spec
  self._kernel_initializer = kernel_initializer
  if kernel_initializer is None:
    self._kernel_initializer = init.GlorotNormalInitializer(
        rhs_spec.index('O'), rhs_spec.index('I'))
def __init__(self, n_units,
             kernel_initializer=init.GlorotUniformInitializer(),
             bias_initializer=init.RandomNormalInitializer(1e-6)):
  """Returns a dense / fully connected layer of width `n_units`.

  Args:
    n_units: Number of nodes in the layer, also known as the "width" of the
        layer.
    kernel_initializer: Function that creates a matrix of (random) initial
        connection weights ($$W$$) for the layer.
    bias_initializer: Function that creates a vector of (random) initial
        bias weights ($$b$$) for the layer.
  """
  super().__init__()
  self._n_units = n_units
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
def LocallyConnectedDense(n_modules, n_units, kernel_size=1,  # pylint: disable=invalid-name
                          kernel_initializer=init.GlorotUniformInitializer(),
                          bias_initializer=init.RandomNormalInitializer(1e-6),
                          use_bias=True):
  """Layer using LocallyConnected1d for approximation of Dense layer.

  The layer splits the last axis of a tensor into `n_modules`, then runs
  LocallyConnected1d (grouped convolution) on all those modules, and
  concatenates their results. It is essentially a locally-sensitive
  approximation of a Dense layer, with the number of parameters smaller by
  a factor of `n_modules / kernel_size`.

  Args:
    n_modules: How many modules (pixels) the input and output should be
        split into for processing.
    n_units: How many outputs (filters) each module should generate.
    kernel_size: The size of the kernel to be used.
    kernel_initializer: Function that creates a matrix of (random) initial
        connection weights `W` for the layer.
    bias_initializer: Function that creates a vector of (random) initial
        bias weights `b` for the layer.
    use_bias: If `True`, compute an affine map `y = Wx + b`; else compute
        a linear map `y = Wx`.

  Returns:
    LocallyConnectedDense base.Layer.
  """
  if n_modules == 1:
    return tl.Dense(n_units, kernel_initializer=kernel_initializer,
                    bias_initializer=bias_initializer, use_bias=use_bias)
  return tl.Serial(
      tl.SplitLastAxis(n_modules),
      tl.LocallyConnected1d(
          n_units, kernel_size,
          kernel_initializer=kernel_initializer,
          bias_initializer=bias_initializer,
          use_bias=use_bias, padding='WRAP'),
      tl.MergeLastTwoAxes())
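# --- Usage sketch (added illustration): assumes LocallyConnectedDense above is
# in scope. With n_modules=4 and kernel_size=1, the layer uses roughly
# n_modules / kernel_size = 4x fewer kernel parameters than the Dense layer it
# approximates: 4 * (1*8*8) = 256 weights versus 32*32 = 1024.
import numpy as np
from trax import shapes

layer = LocallyConnectedDense(n_modules=4, n_units=8)
x = np.ones((2, 32), dtype=np.float32)  # last axis splits into 4 modules of 8
layer.init(shapes.signature(x))
y = layer(x)
print(y.shape)  # (2, 32): 4 modules x 8 units each, merged back together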
def __init__(self,
             n_units,
             kernel_initializer=init.GlorotUniformInitializer(),
             bias_initializer=init.RandomNormalInitializer(1e-6),
             use_bias=True):
  """Returns a dense / fully connected layer of width `n_units`.

  Args:
    n_units: Number of nodes in the layer, also known as the "width" of the
        layer.
    kernel_initializer: Function that creates a matrix of (random) initial
        connection weights ($$W$$) for the layer.
    bias_initializer: Function that creates a vector of (random) initial
        bias weights ($$b$$) for the layer.
    use_bias: If True, compute an affine map: $$y = W x + b$$; else compute
        a linear map: $$y = W x$$.
  """
  super().__init__(name=f'Dense_{n_units}')
  self._n_units = n_units
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
  self._use_bias = use_bias
def __init__(self, vocab_size, d_feature,
             kernel_initializer=init.RandomNormalInitializer(1.0)):
  """Returns an embedding layer with given vocabulary size and vector size.

  The layer clips input values (token ids) to the range `[0, vocab_size)`.
  That is, negative token ids all clip to `0` before being mapped to a
  vector, and token ids with value `vocab_size` or greater all clip to
  `vocab_size - 1` before being mapped to a vector.

  Args:
    vocab_size: Size of the input vocabulary. The layer will assign a
        unique vector to each id in `range(vocab_size)`.
    d_feature: Dimensionality/depth of the output vectors.
    kernel_initializer: Function that creates (random) initial vectors for
        the embedding.
  """
  # TODO(jonni): is the clipping behavior what we want going forward?
  super().__init__(name=f'Embedding_{vocab_size}_{d_feature}')
  self._d_feature = d_feature  # feature dimensionality
  self._vocab_size = vocab_size
  self._kernel_initializer = kernel_initializer
def RelativeAttentionLayer(d_feature,
                           total_kv_pooling,
                           n_heads=1,
                           dropout=0.0,
                           n_raw_tokens_generated=1,
                           max_inference_length=3072,
                           chunk_len=None,
                           chunk_offset=None,
                           mode='train'):
  """Returns a layer that maps (q, k, v, masks) to (activations, masks).

  When the number of keys is smaller than the number of queries, the layer
  works in O(q^2 * d); otherwise it is O(q * k * d). That is because we need
  to shift relative distances by the current pooling; when we upsample, the
  current pooling is a fraction < 1.

  Visual explanation:
  [01][23][45][67] -> [0][1][2][3][4][5][6][7]
  For token [0] we calculate relative distances as follows:
  * 0 2 4 6
  However, for token [1] we need relative distances shifted by 1,
  specifically:
  * -1 1 3 5
  So we need to calculate not only the distances that correspond to the
  spacing between the keys, but also the ones in between, because there is
  more than one query token (at different positions, which means different
  relative distances) for a single key token.

  Args:
    d_feature: Depth/dimensionality of feature embedding.
    total_kv_pooling: Accumulated pool size of keys/values used at this
      layer.
    n_heads: Number of attention heads.
    dropout: Probabilistic rate for internal dropout applied to attention
      activations (based on query-key pairs) before dotting them with
      values.
    n_raw_tokens_generated: Number of tokens generated in a single pass
      through this layer. Used only in 'predict' non-training mode.
    max_inference_length: Maximum sequence length allowed in non-training
      modes.
    chunk_len: Optional; number of tokens per chunk. Setting this option
      enables chunked attention.
    chunk_offset: Optional; offset for shifting chunks, for shifted chunked
      attention.
    mode: One of `'train'`, `'eval'`, or `'predict'`.
  """
  pos_emb = PositionalEmbeddings(
      d_feature,
      total_kv_pooling,
      max_inference_length=max_inference_length,
      chunk_len=chunk_len,
      chunk_offset=chunk_offset,
      n_raw_tokens_generated=n_raw_tokens_generated,
      mode=mode)

  attention = RelativeAttention(  # pylint: disable=no-value-for-parameter
      total_kv_pooling=total_kv_pooling,
      n_heads=n_heads,
      dropout=dropout,
      n_raw_tokens_generated=n_raw_tokens_generated,
      max_inference_length=max_inference_length,
      chunk_len=chunk_len,
      chunk_offset=chunk_offset,
      mode=mode)

  assert d_feature % n_heads == 0
  d_head = d_feature // n_heads
  context_bias_layer = core.Weights(
      init.RandomNormalInitializer(1e-6), shape=(1, n_heads, 1, d_head))
  location_bias_layer = core.Weights(
      init.RandomNormalInitializer(1e-6), shape=(1, n_heads, 1, d_head))

  return cb.Serial(
      cb.Branch(
          cb.Serial(pos_emb, core.Dense(d_feature)),
          core.Dense(d_feature),
          core.Dense(d_feature),
          core.Dense(d_feature),
          cb.Select([1])  # mask
      ),
      context_bias_layer,
      location_bias_layer,
      attention,
      core.Dense(d_feature),
  )
def test_shape(self):
  layer = tl.Weights(init.RandomNormalInitializer(), (5, 10, 3))
  layer.init(())
  y = layer(())
  self.assertEqual(y.shape, (5, 10, 3))
def test_simple_custom_initializer(self):
  layer = tl.Weights(init.RandomNormalInitializer())
  layer.init(())
  y = layer(())
  self.assertEqual(y.shape, ())
  self.assertNotEqual(y.tolist(), 0.)
def __init__(self, initializer=init.RandomNormalInitializer(0.01)):
  super(ShiftRightLearned, self).__init__()
  self._initializer = initializer
def test_random_normal(self):
  initializer = initializers.RandomNormalInitializer()
  input_shape = (29, 5, 7, 20)
  init_value = initializer(input_shape, random.get_prng(0))
  self.assertEqual(tuple(init_value.shape), input_shape)