def __init__(self, n_units, kernel_initializer=init.GlorotUniformInitializer(),
             bias_initializer=init.RandomNormalInitializer(1e-6), use_bias=True,
             use_bfloat16=False):
  """Returns a dense (fully connected) layer of width `n_units`.

  A dense layer maps collections of `R^m` vectors to `R^n`; `n` (`= n_units`)
  is fixed when the layer is created, while `m` is only determined when the
  layer is initialized on its first input.

  Args:
    n_units: Number of nodes in the layer, i.e. the layer's width.
    kernel_initializer: Function producing the (random) initial connection
        weight matrix `W`.
    bias_initializer: Function producing the (random) initial bias vector `b`.
    use_bias: If `True`, compute the affine map `y = Wx + b`; otherwise the
        linear map `y = Wx`.
    use_bfloat16: If `True`, keep weights in bfloat16 rather than the default
        float32; saves memory but may (rarely) cause numerical issues.
  """
  super().__init__(name=f'Dense_{n_units}')
  self._use_bias = use_bias
  self._use_bfloat16 = use_bfloat16
  self._n_units = n_units
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
def EinsumDense(d_input, d_output, use_bias):
  """Returns a reimplementation of Dense layer, using einsum.

  While this is an equivalent of a Dense layer, it seems to be faster when
  used in decoding if used with bias (see decoding_timing_test.py). This
  layer can be removed when we understand better the reason for the
  difference in decoding speed.

  Args:
    d_input: Dimensionality of the input tensor.
    d_output: Dimensionality of the output tensor.
    use_bias: Whether to use bias.
  """
  def _apply_kernel(kernel, embeds):
    # Contract the feature axis of the input against the kernel.
    return jnp.einsum('xd,...d->...x', kernel, embeds)

  layers = [
      tl.Weights(init.GlorotUniformInitializer(), [d_output, d_input]),
      tl.Fn('EinsumDense', _apply_kernel),
  ]
  if use_bias:
    layers.append(tl.Weights(init.RandomNormalInitializer(1e-6), [d_output]))
    layers.append(tl.Add())
  return tl.Serial(layers)
def init_weights_and_state(self, input_signature):
  """Randomly initializes the positional encoding vectors.

  Args:
    input_signature: :py:class:`ShapeDtype` instance characterizing the input
      this layer should compute on.
  """
  # Width of the encoding table: an explicit override if given, otherwise
  # the input's feature dimension.
  width = (self._d_feature if self._d_feature is not None
           else input_signature.shape[-1])
  positions = np.arange(0, self._max_len)[:, np.newaxis]
  # Geometric progression of angular rates, as in the standard sinusoidal
  # positional encoding.
  angular_rates = np.exp(
      np.arange(0, width, 2) * -(np.log(10000.0) / width))
  table = np.zeros((self._max_len, width), dtype=np.float32)
  table[:, 0::2] = np.sin(positions * angular_rates)
  table[:, 1::2] = np.cos(positions * angular_rates)  # [max_len, width]
  if self._use_bfloat16:
    table = table.astype(jnp.bfloat16)
  encoding = jnp.array(table)  # Trainable parameters, initialized above.
  if self._d_feature is None:
    self.weights = encoding
  else:
    # When the encoding width is overridden, an extra trainable projection
    # maps it back to the input's feature dimension.
    projection = init.GlorotUniformInitializer()(
        (width, input_signature.shape[-1]), self.rng)
    self.weights = encoding, projection
  if self._mode == 'predict':
    # Predict mode tracks the current decoding position in state.
    self.state = jnp.zeros((), dtype=jnp.int32)
def __init__(self, d_ff, n_elements_in_block=32, d_lowrank=64, temperature=0.1,
             quant_prob=0.3, use_bfloat16=False, big_weights_in_bfloat16=True,
             mode='train', kernel_initializer=init.GlorotUniformInitializer(),
             bias_initializer=init.RandomNormalInitializer(1e-6)):
  """Returns a sparse feed-forward block."""
  super().__init__(name=f'SparseFF_{d_ff}')
  # d_ff must split evenly into blocks of n_elements_in_block elements.
  assert d_ff % n_elements_in_block == 0
  self._mode = mode
  self._use_bfloat16 = use_bfloat16
  self._big_weights_in_bfloat16 = big_weights_in_bfloat16
  self._d_ff = d_ff
  self._d_lowrank = d_lowrank
  # Q: what temperature is actually most useful in training?
  self._temperature = temperature if mode == 'train' else 0.0
  self._quant_prob = quant_prob
  self._n_elements_in_block = n_elements_in_block
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
  # Helper numbers as d_ff will be divided by n_elements_in_block.
  self._d1 = d_ff // n_elements_in_block
  self._d2 = n_elements_in_block
def __init__(self, filters, kernel_size,
             kernel_initializer=init.GlorotUniformInitializer(),
             bias_initializer=init.RandomNormalInitializer(1e-6),
             use_bias=True, padding='VALID'):
  """Returns a locally-connected conv-like layer.

  Args:
    filters: Number of output filters in the convolution.
    kernel_size: A length of the convolution window. Must be an odd number.
    kernel_initializer: Function producing the (random) initial connection
        weight matrix `W`.
    bias_initializer: Function producing the (random) initial bias vector `b`.
    use_bias: If `True`, the layer uses a bias vector.
    padding: The type of padding to use; must be 'VALID', 'SAME', or 'WRAP'.
  """
  super().__init__(name=f'LocallyConnected1d_{filters}_{kernel_size}')
  assert kernel_size % 2 == 1  # kernel size has to be odd
  self._filters = filters
  self._kernel_size = kernel_size
  self._padding = padding
  self._use_bias = use_bias
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
def __init__(self, d_feature, vocab_size,
             kernel_initializer=init.GlorotUniformInitializer()):
  """Creates an embedding layer over a vocabulary of `vocab_size` ids.

  Args:
    d_feature: Dimensionality (feature width) of each embedding vector.
    vocab_size: Size of the vocabulary being embedded.
    kernel_initializer: Function that creates the (random) initial embedding
        weights.
  """
  # Zero-argument super() for consistency with the other __init__s in this
  # file (behavior is identical to super(Embedding, self) in Python 3).
  super().__init__()
  self._d_feature = d_feature  # feature dimensionality
  self._vocab_size = vocab_size
  self._kernel_initializer = kernel_initializer
def __init__(self, n_heads=1, d_model=1024,
             kernel_initializer=init.GlorotUniformInitializer()):
  """Creates the layer that combines attention heads into model output.

  Args:
    n_heads: Number of attention heads.
    d_model: Dimensionality of the model's output activations.
    kernel_initializer: Function that creates the (random) initial output
        projection weights.
  """
  # Zero-argument super() for consistency with the other __init__s in this
  # file (behavior is identical to super(ComputeAttentionOutput, self)).
  super().__init__()
  self._n_heads = n_heads
  self._d_model = d_model
  self._kernel_initializer = kernel_initializer
def __init__(self, n_heads=1, d_head=64,
             kernel_initializer=init.GlorotUniformInitializer()):
  """Creates the layer that computes per-head attention activations.

  Args:
    n_heads: Number of attention heads.
    d_head: Dimensionality of each attention head.
    kernel_initializer: Function that creates the (random) initial projection
        weights.
  """
  # Zero-argument super() for consistency with the other __init__s in this
  # file (behavior is identical to super(ComputeAttentionHeads, self)).
  super().__init__()
  self._n_heads = n_heads
  self._d_head = d_head
  self._kernel_initializer = kernel_initializer
def __init__(self, n_units, kernel_initializer=init.GlorotUniformInitializer(),
             bias_initializer=init.RandomNormalInitializer(1e-6)):
  """Creates a dense (fully connected) layer of width `n_units`.

  Args:
    n_units: Number of nodes in the layer, i.e. the layer's width.
    kernel_initializer: Function that creates a matrix of (random) initial
        connection weights `W` for the layer.
    bias_initializer: Function that creates a vector of (random) initial bias
        weights `b` for the layer.
  """
  # Zero-argument super() for consistency with the other __init__s in this
  # file (behavior is identical to super(Dense, self) in Python 3).
  super().__init__()
  self._n_units = n_units
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
def __init__(self, kernel_size=3,
             kernel_initializer=init.GlorotUniformInitializer(),
             use_bfloat16=False):
  """Returns a causal depthwise convolution layer.

  Args:
    kernel_size: Length of the convolution window.
    kernel_initializer: Function producing the (random) initial kernel
        weights.
    use_bfloat16: Presumably stores weights in bfloat16 instead of float32,
        as with the other `use_bfloat16` flags in this file — confirm in
        `init_weights_and_state`.
  """
  # Single input tensor, single output tensor.
  super().__init__(n_in=1, n_out=1)
  self._use_bfloat16 = use_bfloat16
  self._kernel_size = kernel_size
  self._kernel_initializer = kernel_initializer
def __init__(self, n_units, forget_bias=1.0,
             kernel_initializer=initializers.GlorotUniformInitializer(),
             bias_initializer=initializers.RandomNormalInitializer(1e-6)):
  """Creates an LSTM cell with `n_units` hidden units.

  Args:
    n_units: Number of hidden units in the cell.
    forget_bias: Bias added to the forget gate at initialization; 1.0 is the
        conventional default.
    kernel_initializer: Function that creates the (random) initial connection
        weights.
    bias_initializer: Function that creates the (random) initial bias weights.
  """
  # Zero-argument super() for consistency with the other __init__s in this
  # file (behavior is identical to super(LSTMCell, self) in Python 3).
  # Two inputs / two outputs: the activation and the carried (hidden) state.
  super().__init__(n_in=2, n_out=2)
  self._n_units = n_units
  self._forget_bias = forget_bias
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
def MultiplicativeModularSparseDense(sparsity, d_feature):
  """Returns a replacement of Dense layer which uses less parameters.

  The layer uses number of modules equal to `sparsity`. It is a combination
  of multiplicative dense and locally connected dense layers.

  Args:
    sparsity: The sparsity of the layer; the output vector is divided into
        this number of modules.
    d_feature: Dimensionality of input and output tensor.
  """
  assert d_feature % sparsity == 0
  d_module = d_feature // sparsity

  def _fused_einsum(kmod, kmult, multiplier, embeds):
    # To save memory the per-head preprocessing and multiplying by the
    # kernels is done in a single einsum.
    return jnp.einsum('hxo,dx,hd,...d->...ho', kmod, kmult, multiplier, embeds)

  return tl.Serial(
      # Per-head preprocessing multiplier for the embedding.
      tl.Weights(init.RandomNormalInitializer(stddev=0.5),
                 shape=[sparsity, d_feature]),
      # Multiplicative-dense kernel, shared across heads.
      tl.Weights(init.GlorotUniformInitializer(), [d_feature, d_module]),
      # Modular-dense kernel.
      tl.Weights(
          functools.partial(init.GlorotUniformInitializer(),
                            nonreceptive_dims=[0]),
          [sparsity, d_module, d_module]),
      tl.Fn('SparseDenseEinsum', _fused_einsum),
      MergeLastTwoAxes(),
      # Per-head bias applied after the dense computation.
      tl.Weights(init.RandomNormalInitializer(1e-6), [d_feature]),
      tl.Add(),
  )
def MultiplicativeSparseDense(sparsity, d_input, d_output=None, use_bias=True,
                              use_bfloat16=False):
  """Returns a replacement of Dense layer which uses less parameters.

  The layer uses number of modules equal to `sparsity`. It multiplies each
  dimension of the input tensor by a scalar specific to each dimension and
  each module separately; then it applies Dense(d_output/sparsity) to each
  module. Compared to standard dense layer, MultiplicativeSparseDense
  uses less parameters while still being able to express many interesting
  functions (for example a permutation).

  Args:
    sparsity: The sparsity of the layer; the output vector is divided into
        this number of modules.
    d_input: Dimensionality of input tensor.
    d_output: Dimensionality of output tensor; by default equal to d_input.
    use_bias: Whether to use bias.
    use_bfloat16: Whether to use bfloat16 for weights.
  """
  # Honor the documented default: d_output falls back to d_input. (Without
  # this, omitting d_output raised `TypeError: unsupported operand ...` on
  # `None % sparsity` below.)
  if d_output is None:
    d_output = d_input
  assert d_output % sparsity == 0
  d_module = d_output // sparsity

  layers = [
      # Weight below is used for per-head preprocessing of an embedding.
      tl.Weights(init.RandomNormalInitializer(stddev=0.5),
                 shape=[sparsity, d_input], use_bfloat16=use_bfloat16),
      # Weight below is dense kernel, shared across heads.
      tl.Weights(init.GlorotUniformInitializer(), [d_input, d_module],
                 use_bfloat16=use_bfloat16),
      # To save memory the per-head preprocessing and multiplying by the
      # kernel is done in the same einsum.
      tl.Fn(
          'AttentionEinsum',
          (lambda kernel, multiplier, embeds:  # pylint: disable=g-long-lambda
           jnp.einsum('dx,hd,...d->...hx', kernel, multiplier, embeds))),
      MergeLastTwoAxes(),
  ]
  if use_bias:
    layers.extend([
        # Weight below is bias after dense, per-head.
        tl.Weights(init.RandomNormalInitializer(1e-6), [d_output],
                   use_bfloat16=use_bfloat16),
        tl.Add(),
    ])
  return tl.Serial(layers)
def __init__(self, d_ff, num_experts=64, temperature=0.7, mode='train',
             kernel_initializer=init.GlorotUniformInitializer(),
             bias_initializer=init.RandomNormalInitializer(1e-6)):
  """Returns a block sparse feed-forward block.

  Args:
    d_ff: Width of the feed-forward layer; must be divisible by num_experts.
    num_experts: Number of expert blocks the layer is divided into.
    temperature: Sampling temperature; applied only in 'train' mode (0.0
        otherwise).
    mode: One of the layer modes, e.g. 'train'.
    kernel_initializer: Function that creates a matrix of (random) initial
        connection weights for the layer.
    bias_initializer: Function that creates a vector of (random) initial bias
        weights for the layer.
  """
  super().__init__(name=f'BlockSparseFF_{d_ff}')
  # Validate divisibility up front, before deriving the per-expert block
  # size — previously the check ran only after `d_ff // num_experts` had
  # already silently floored the division.
  assert d_ff % num_experts == 0
  self._mode = mode
  self._d_ff = d_ff
  self._num_experts = num_experts
  self._temperature = temperature if mode == 'train' else 0.0
  self._n_elements_in_block = d_ff // num_experts
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
def __init__(self, n_units, kernel_initializer=init.GlorotUniformInitializer(),
             bias_initializer=init.RandomNormalInitializer(1e-6)):
  """Returns a dense / fully connected layer of width `n_units`.

  Args:
    n_units: Number of nodes in the layer, also known as the "width" of the
        layer.
    kernel_initializer: Function producing the matrix of (random) initial
        connection weights ($$W$$).
    bias_initializer: Function producing the vector of (random) initial bias
        weights ($$b$$).
  """
  super().__init__()
  self._bias_initializer = bias_initializer
  self._kernel_initializer = kernel_initializer
  self._n_units = n_units
def LocallyConnectedDense(
    n_modules, n_units, kernel_size=1,  # pylint: disable=invalid-name
    kernel_initializer=init.GlorotUniformInitializer(),
    bias_initializer=init.RandomNormalInitializer(1e-6),
    use_bias=True):
  """Layer using LocallyConnected1d for approximation of Dense layer.

  The layer splits the last axis of a tensor into `n_modules`, then runs
  LocallyConnected1d (grouped convolution) on all those modules, and
  concatenates their results. It is essentially a locally-sensitive
  approximation of Dense layer, with number of parameters smaller by the
  factor of `n_modules / kernel_size`.

  Args:
    n_modules: Indicates how many modules (pixels) should be input and output
        split into for processing.
    n_units: how many outputs (filters) should each module generate.
    kernel_size: The size of the kernel to be used.
    kernel_initializer: Function that creates a matrix of (random) initial
        connection weights `W` for the layer.
    bias_initializer: Function that creates a vector of (random) initial bias
        weights `b` for the layer.
    use_bias: If `True`, compute an affine map `y = Wx + b`; else compute
        a linear map `y = Wx`.

  Returns:
    LocallyConnectedDense base.Layer.
  """
  # Degenerate case: with a single module this is exactly a Dense layer.
  if n_modules == 1:
    return tl.Dense(n_units, kernel_initializer=kernel_initializer,
                    bias_initializer=bias_initializer, use_bias=use_bias)
  grouped_conv = tl.LocallyConnected1d(
      n_units, kernel_size, kernel_initializer=kernel_initializer,
      bias_initializer=bias_initializer, use_bias=use_bias, padding='WRAP')
  return tl.Serial(
      tl.SplitLastAxis(n_modules), grouped_conv, tl.MergeLastTwoAxes())
def __init__(self, n_units, kernel_initializer=init.GlorotUniformInitializer(),
             bias_initializer=init.RandomNormalInitializer(1e-6),
             use_bias=True):
  """Returns a dense / fully connected layer of width `n_units`.

  Args:
    n_units: Number of nodes in the layer, also known as the "width" of the
        layer.
    kernel_initializer: Function producing the matrix of (random) initial
        connection weights ($$W$$).
    bias_initializer: Function producing the vector of (random) initial bias
        weights ($$b$$).
    use_bias: If True, compute an affine map: $$y = W x + b$$; else compute
        a linear map: $$y = W x$$.
  """
  super().__init__(name=f'Dense_{n_units}')
  self._use_bias = use_bias
  self._n_units = n_units
  self._kernel_initializer = kernel_initializer
  self._bias_initializer = bias_initializer
def test_glorot_uniform(self):
  """Glorot-uniform init yields a tensor of exactly the requested shape."""
  shape = (29, 5, 7, 20)
  weights = initializers.GlorotUniformInitializer()(shape, random.get_prng(0))
  self.assertEqual(tuple(weights.shape), shape)