Example no. 1
def MultiplicativeSparseDense(sparsity,
                              d_input,
                              d_output=None,
                              use_bias=True,
                              use_bfloat16=False):
    """Returns a replacement of Dense layer which uses less parameters.

  The layer uses a number of modules equal to `sparsity`. It multiplies each
  dimension of the input tensor by a scalar specific to that dimension and to
  each module separately; then it applies Dense(d_output/sparsity) to each
  module. Compared to a standard dense layer, MultiplicativeSparseDense uses
  fewer parameters while still being able to express many interesting
  functions (for example, a permutation).

  Args:
    sparsity: The sparsity of the layer; the output vector is divided into this
        number of modules.
    d_input: Dimensionality of input tensor.
    d_output: Dimensionality of output tensor; by default equal to d_input.
    use_bias: Whether to use bias.
    use_bfloat16: Whether to use bfloat16 for weights.
  """

    if d_output is None:  # default documented above: d_output equals d_input
        d_output = d_input
    assert d_output % sparsity == 0
    d_module = d_output // sparsity

    layers = [
        # Weight below is used for per-head preprocessing of an embedding.
        tl.Weights(init.RandomNormalInitializer(stddev=0.5),
                   shape=[sparsity, d_input],
                   use_bfloat16=use_bfloat16),
        # Weight below is dense kernel, shared across heads.
        tl.Weights(init.GlorotUniformInitializer(), [d_input, d_module],
                   use_bfloat16=use_bfloat16),
        # To save memory, the per-head preprocessing and the multiplication by
        # the kernel are done in the same einsum.
        tl.Fn(
            'AttentionEinsum',
            (
                lambda kernel, multiplier, embeds:  # pylint: disable=g-long-lambda
                jnp.einsum('dx,hd,...d->...hx', kernel, multiplier, embeds))),
        MergeLastTwoAxes(),
    ]
    if use_bias:
        layers.extend([
            # Weight below is bias after dense, per-head.
            tl.Weights(init.RandomNormalInitializer(1e-6), [d_output],
                       use_bfloat16=use_bfloat16),
            tl.Add(),
        ])
    return tl.Serial(layers)
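A minimal usage sketch for this layer (assuming Trax is installed and the function is importable from trax.layers.research.sparsity, where it lives upstream; shapes are illustrative only):

import numpy as np
from trax import shapes
from trax.layers.research import sparsity

# 4 modules; each module maps d_input=8 to d_output/sparsity=4 outputs.
layer = sparsity.MultiplicativeSparseDense(sparsity=4, d_input=8, d_output=16)
x = np.ones((2, 3, 8), dtype=np.float32)   # (batch, length, d_input)
layer.init(shapes.signature(x))
y = layer(x)                               # expected shape: (2, 3, 16)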
Example no. 2
    def __init__(self,
                 n_units,
                 kernel_initializer=init.GlorotUniformInitializer(),
                 bias_initializer=init.RandomNormalInitializer(1e-6),
                 use_bias=True,
                 use_bfloat16=False):
        """Returns a dense (fully connected) layer of width `n_units`.

    A dense layer maps collections of `R^m` vectors to `R^n`, where `n`
    (`= n_units`) is fixed at layer creation time, and `m` is set at layer
    initialization time.

    Args:
      n_units: Number of nodes in the layer, also known as the width of the
          layer.
      kernel_initializer: Function that creates a matrix of (random) initial
          connection weights `W` for the layer.
      bias_initializer: Function that creates a vector of (random) initial
          bias weights `b` for the layer.
      use_bias: If `True`, compute an affine map `y = Wx + b`; else compute
          a linear map `y = Wx`.
      use_bfloat16: If `True`, use bfloat16 weights instead of the default
        float32; this can save memory but may (rarely) lead to numerical issues.
    """
        super().__init__(name=f'Dense_{n_units}')
        self._n_units = n_units
        self._kernel_initializer = kernel_initializer
        self._bias_initializer = bias_initializer
        self._use_bias = use_bias
        self._use_bfloat16 = use_bfloat16
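A minimal sketch of how this constructor is typically used, here with a RandomNormalInitializer kernel in place of the Glorot default (assumes Trax is installed; sizes are illustrative):

import numpy as np
from trax import layers as tl, shapes
from trax.layers import initializers as init

layer = tl.Dense(16, kernel_initializer=init.RandomNormalInitializer(stddev=0.1))
x = np.ones((4, 8), dtype=np.float32)   # m = 8 input features per vector
layer.init(shapes.signature(x))
y = layer(x)                            # expected shape: (4, 16), i.e. n = n_units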
Example no. 3
def Conv1d(filters, kernel_size, stride=1, padding='VALID',
           kernel_initializer=None,
           bias_initializer=init.RandomNormalInitializer(1e-6)):
  return Conv(filters, (kernel_size,), strides=(stride,), padding=padding,
              dimension_numbers=('NWC', 'WIO', 'NWC'),
              kernel_initializer=kernel_initializer,
              bias_initializer=bias_initializer)
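A hedged usage sketch for this wrapper (assumes Trax is installed; in the 'NWC' layout the axes are batch, width, channels):

import numpy as np
from trax import layers as tl, shapes

layer = tl.Conv1d(filters=16, kernel_size=3)   # stride 1, 'VALID' padding
x = np.ones((2, 10, 8), dtype=np.float32)      # (batch, width, channels)
layer.init(shapes.signature(x))
y = layer(x)                                   # expected shape: (2, 8, 16)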
Example no. 4
    def __init__(self,
                 filters,
                 kernel_size,
                 kernel_initializer=init.GlorotUniformInitializer(),
                 bias_initializer=init.RandomNormalInitializer(1e-6),
                 use_bias=True,
                 padding='VALID'):
        """Returns a locally-connected conv-like layer.

    Args:
      filters: Number of output filters in the convolution.
      kernel_size: Length of the convolution window; must be an odd number.
      kernel_initializer: Function that creates a matrix of (random) initial
          connection weights `W` for the layer.
      bias_initializer: Function that creates a vector of (random) initial
          bias weights `b` for the layer.
      use_bias: If `True`, the layer uses a bias vector.
      padding: The type of padding to use; must be 'VALID', 'SAME', or 'WRAP'.
    """
        super().__init__(name=f'LocallyConnected1d_{filters}_{kernel_size}')
        self._filters = filters
        self._kernel_size = kernel_size
        assert self._kernel_size % 2 == 1  # kernel size has to be odd
        self._kernel_initializer = kernel_initializer
        self._bias_initializer = bias_initializer
        self._use_bias = use_bias
        self._padding = padding
Example no. 5
 def __init__(self,
              d_ff,
              n_elements_in_block=32,
              d_lowrank=64,
              temperature=0.1,
              quant_prob=0.3,
              use_bfloat16=False,
              big_weights_in_bfloat16=True,
              mode='train',
              kernel_initializer=init.GlorotUniformInitializer(),
              bias_initializer=init.RandomNormalInitializer(1e-6)):
     """Returns a sparse feed-forward block."""
     super().__init__(name=f'SparseFF_{d_ff}')
     self._mode = mode
     self._use_bfloat16 = use_bfloat16
     self._big_weights_in_bfloat16 = big_weights_in_bfloat16
     self._d_ff = d_ff
     self._d_lowrank = d_lowrank
     # Q: what temperature is actually most useful in training?
     self._temperature = temperature if mode == 'train' else 0.0
     self._quant_prob = quant_prob
     self._n_elements_in_block = n_elements_in_block
     self._kernel_initializer = kernel_initializer
     self._bias_initializer = bias_initializer
     # Helper numbers as d_ff will be divided by n_elements_in_block.
     assert self._d_ff % self._n_elements_in_block == 0
     self._d1 = self._d_ff // self._n_elements_in_block
     self._d2 = self._n_elements_in_block
Example no. 6
def EinsumDense(d_input, d_output, use_bias):
    """Returns a reimplementation of Dense layer, using einsum.

  While this is an equivalent of a Dense layer, it seems to be faster when used
  in decoding if used with bias (see decoding_timing_test.py ).
  This layer can be removed when we understand better the reason for the
  difference in decoding speed.

  Args:
    d_input: Dimensionality of the input tensor.
    d_output: Dimensionality of the output tensor.
    use_bias: Whether to use bias.
  """
    layers = [
        tl.Weights(init.GlorotUniformInitializer(), [d_output, d_input]),
        tl.Fn(
            'EinsumDense',
            (
                lambda kernel, embeds:  # pylint: disable=g-long-lambda
                jnp.einsum('xd,...d->...x', kernel, embeds)))
    ]
    if use_bias:
        layers.extend([
            tl.Weights(init.RandomNormalInitializer(1e-6), [d_output]),
            tl.Add()
        ])
    return tl.Serial(layers)
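The einsum contraction used above can be checked in isolation; a small sketch with plain numpy and made-up shapes:

import numpy as np

kernel = np.random.randn(16, 8)    # [d_output, d_input]
embeds = np.random.randn(2, 5, 8)  # [..., d_input]
out = np.einsum('xd,...d->...x', kernel, embeds)
assert out.shape == (2, 5, 16)     # [..., d_output]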
Example no. 7
    def __init__(self,
                 d_feature,
                 vocab_size,
                 kernel_initializer=init.RandomNormalInitializer(1.0)):
        """Returns an embedding layer with given vocabulary size and vector size.

    The layer clips input values (token ids) to the range `[0, vocab_size)`.
    That is, negative token ids all clip to `0` before being mapped to a
    vector, and token ids with value `vocab_size` or greater all clip to
    `vocab_size - 1` before being mapped to a vector. In effect, both id `0`
    and id `vocab_size - 1` are potentially overloaded as out-of-vocabulary
    token ids.

    TODO(jonni): Is this the behavior we want going forward?

    Args:
      d_feature: Dimensionality/depth of the output vectors.
      vocab_size: Size of the input vocabulary. The layer will assign a unique
          vector to each id in `range(vocab_size)`.
      kernel_initializer: Function that creates (random) initial vectors for
          the embedding.
    """
        super().__init__()
        self._d_feature = d_feature  # feature dimensionality
        self._vocab_size = vocab_size
        self._kernel_initializer = kernel_initializer
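A small sketch of the clipping behavior described in the docstring, using plain numpy rather than the layer itself (vocab_size chosen arbitrarily):

import numpy as np

vocab_size = 10
token_ids = np.array([-3, 0, 5, 9, 10, 42])
clipped = np.clip(token_ids, 0, vocab_size - 1)   # -> [0, 0, 5, 9, 9, 9]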
Example no. 8
 def __init__(self,
              n_units,
              kernel_initializer=init.GlorotUniformInitializer(),
              bias_initializer=init.RandomNormalInitializer(1e-6)):
     super(Dense, self).__init__()
     self._n_units = n_units
     self._kernel_initializer = kernel_initializer
     self._bias_initializer = bias_initializer
Example no. 9
 def __init__(self,
              d_feature,
              vocab_size,
              kernel_initializer=init.RandomNormalInitializer(1.0)):
     super(Embedding, self).__init__()
     self._d_feature = d_feature  # feature dimensionality
     self._vocab_size = vocab_size
     self._kernel_initializer = kernel_initializer
Example no. 10
 def __init__(self,
              base=16,
              n_digits=2,
              mode='train',
              initializer=init.RandomNormalInitializer(1e-6)):
     super(FixedBasePositionalEncoding, self).__init__()
     self._base = base
     self._n_digits = n_digits
     self._mode = mode
     self._initializer = initializer
Example no. 11
 def __init__(self,
              n_units,
              forget_bias=0.0,
              kernel_initializer=initializers.RandomUniformInitializer(0.01),
              bias_initializer=initializers.RandomNormalInitializer(1e-6)):
   super().__init__(n_in=2, n_out=2)
   self._n_units = n_units
   self._forget_bias = forget_bias
   self._kernel_initializer = kernel_initializer
   self._bias_initializer = bias_initializer
Example no. 12
 def __init__(self,
              n_units,
              forget_bias=1.0,
              kernel_initializer=initializers.GlorotUniformInitializer(),
              bias_initializer=initializers.RandomNormalInitializer(1e-6)):
     super(LSTMCell, self).__init__(n_in=2, n_out=2)
     self._n_units = n_units
     self._forget_bias = forget_bias
     self._kernel_initializer = kernel_initializer
     self._bias_initializer = bias_initializer
Example no. 13
def _get_rel_att_inputs(d_model, n_heads):  # pylint: disable=invalid-name
  """Global relative attentions bias initialization shared across the layers."""
  assert d_model % n_heads == 0 and d_model % 2 == 0
  d_head = d_model // n_heads

  bias_initializer = init.RandomNormalInitializer(1e-6)
  context_bias_layer = core.Weights(bias_initializer,
                                    shape=(1, n_heads, 1, d_head))
  location_bias_layer = core.Weights(bias_initializer,
                                     shape=(1, n_heads, 1, d_head))
  return context_bias_layer, location_bias_layer
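Following the pattern of the Weights tests elsewhere on this page, a hedged sketch of what the returned bias layers emit if called directly (d_model=512 and n_heads=8 are illustrative):

context_bias_layer, location_bias_layer = _get_rel_att_inputs(d_model=512, n_heads=8)
context_bias_layer.init(())
bias = context_bias_layer(())   # expected shape: (1, 8, 1, 64), i.e. (1, n_heads, 1, d_head)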
Example no. 14
    def test_custom_initializer_shape(self):
        layer = tl.Weights(
            lambda shape, rng: jnp.zeros(shape, dtype=jnp.float32), (2, 2))
        layer.init(())
        y = layer(())
        self.assertEqual(y.tolist(), [[0., 0.], [0., 0.]])

        layer = tl.Weights(init.RandomNormalInitializer(), (2, 2))
        layer.init(())
        y = layer(())
        self.assertEqual(y.shape, (2, 2))
        self.assertNotEqual(y.tolist(), [[0., 0.], [0., 0.]])
Example no. 15
 def __init__(self,
              filters,
              kernel_width=3,
              kernel_initializer=None,
              bias_initializer=init.RandomNormalInitializer(1e-6)):
     super(CausalConv,
           self).__init__(filters=filters,
                          kernel_size=(kernel_width, ),
                          strides=None,
                          padding='VALID',
                          dimension_numbers=('NWC', 'WIO', 'NWC'),
                          kernel_initializer=kernel_initializer,
                          bias_initializer=bias_initializer)
Example no. 16
def MultiplicativeModularSparseDense(sparsity, d_feature):
    """Returns a replacement of Dense layer which uses less parameters.

  The layer uses a number of modules equal to `sparsity`. It is a combination
  of multiplicative dense and locally-connected dense layers.

  Args:
    sparsity: The sparsity of the layer; the output vector is divided into this
        number of modules.
    d_feature: Dimensionality of input and output tensor.
  """

    assert d_feature % sparsity == 0
    d_module = d_feature // sparsity

    return tl.Serial(
        # Weight below is used for per-head preprocessing of an embedding.
        tl.Weights(init.RandomNormalInitializer(stddev=0.5),
                   shape=[sparsity, d_feature]),
        # Weight below is a kernel of multiplicative dense, shared across heads.
        tl.Weights(init.GlorotUniformInitializer(), [d_feature, d_module]),
        # Weight below is a kernel of modular dense.
        tl.Weights(
            functools.partial(init.GlorotUniformInitializer(),
                              nonreceptive_dims=[0]),
            [sparsity, d_module, d_module]),
        # To save memory, the per-head preprocessing and the multiplication by
        # the kernels are done in a single einsum.
        tl.Fn(
            'SparseDenseEinsum',
            (
                lambda kmod, kmult, multiplier, embeds:  # pylint: disable=g-long-lambda
                jnp.einsum('hxo,dx,hd,...d->...ho', kmod, kmult, multiplier,
                           embeds))),
        MergeLastTwoAxes(),
        # Weight below is bias after dense, per-head.
        tl.Weights(init.RandomNormalInitializer(1e-6), [d_feature]),
        tl.Add(),
    )
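As with Example no. 1, a minimal usage sketch (assuming the function is importable from trax.layers.research.sparsity; shapes are illustrative):

import numpy as np
from trax import shapes
from trax.layers.research import sparsity

layer = sparsity.MultiplicativeModularSparseDense(sparsity=4, d_feature=16)
x = np.ones((2, 3, 16), dtype=np.float32)   # d_feature must be divisible by sparsity
layer.init(shapes.signature(x))
y = layer(x)                                # expected shape: (2, 3, 16)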
Example no. 17
  def __init__(self, shape=(64, 64, 3), d_embs=(384, 384, 256),
               kernel_initializer=init.RandomNormalInitializer(1.0),
               dropout=0.0, dropout_broadcast_dims=(), mode='train'):
    super().__init__()
    self._kernel_initializer = kernel_initializer
    assert len(shape) == len(d_embs)
    self._shape = shape
    self._d_embs = d_embs

    if dropout >= 1.0:
      raise ValueError('Dropout rates must be lower than 1.')
    if mode == 'train':
      self._dropout = dropout
    else:
      self._dropout = 0.0
    self._dropout_broadcast_dims = dropout_broadcast_dims
    self._mode = mode
Example no. 18
 def __init__(self,
              d_ff,
              num_experts=64,
              temperature=0.7,
              mode='train',
              kernel_initializer=init.GlorotUniformInitializer(),
              bias_initializer=init.RandomNormalInitializer(1e-6)):
     """Returns a block sparse feed-forward block."""
     super().__init__(name=f'BlockSparseFF_{d_ff}')
     self._mode = mode
     self._d_ff = d_ff
     self._num_experts = num_experts
     self._temperature = temperature if mode == 'train' else 0.0
     self._n_elements_in_block = d_ff // num_experts
     self._kernel_initializer = kernel_initializer
     self._bias_initializer = bias_initializer
     assert self._d_ff % self._num_experts == 0
Example no. 19
 def __init__(self, filters, kernel_size, strides=None, padding='VALID',
              dimension_numbers=('NHWC', 'HWIO', 'NHWC'),
              kernel_initializer=None,
              bias_initializer=init.RandomNormalInitializer(1e-6)):
   super().__init__()
   self._filters = filters
   self._kernel_size = kernel_size
   self._padding = padding
   self._dimension_numbers = dimension_numbers
   self._lhs_spec, self._rhs_spec, self._out_spec = dimension_numbers
   self._one = (1,) * len(kernel_size)
   self._strides = strides or self._one
   self._bias_initializer = bias_initializer
   rhs_spec = self._rhs_spec
   self._kernel_initializer = kernel_initializer
   if kernel_initializer is None:
     self._kernel_initializer = init.GlorotNormalInitializer(
         rhs_spec.index('O'), rhs_spec.index('I'))
Example no. 20
    def __init__(self,
                 n_units,
                 kernel_initializer=init.GlorotUniformInitializer(),
                 bias_initializer=init.RandomNormalInitializer(1e-6)):
        """Returns a dense / fully connected layer of width `n_units`.

    Args:
      n_units: Number of nodes in the layer, also known as the "width" of the
          layer.
      kernel_initializer: Function that creates a matrix of (random) initial
          connection weights ($$W$$) for the layer.
      bias_initializer: Function that creates a vector of (random) initial
          bias weights ($$b$$) for the layer.
    """
        super().__init__()
        self._n_units = n_units
        self._kernel_initializer = kernel_initializer
        self._bias_initializer = bias_initializer
Example no. 21
def LocallyConnectedDense(
        n_modules,
        n_units,
        kernel_size=1,  # pylint: disable=invalid-name
        kernel_initializer=init.GlorotUniformInitializer(),
        bias_initializer=init.RandomNormalInitializer(1e-6),
        use_bias=True):
    """Layer using LocallyConnected1d for approximation of Dense layer.

  The layer splits the last axis of a tensor into `n_modules`, then runs
  LocallyConnected1d (grouped convolution) on all those modules, and
  concatenates their results. It is essentially a locally-sensitive
  approximation of Dense layer, with number of parameters smaller by the factor
  of `n_modules / kernel_size`.

  Args:
    n_modules: Number of modules (pixels) that the input and output are split
        into for processing.
    n_units: Number of outputs (filters) each module should generate.
    kernel_size: The size of the kernel to be used.
    kernel_initializer: Function that creates a matrix of (random) initial
        connection weights `W` for the layer.
    bias_initializer: Function that creates a vector of (random) initial
        bias weights `b` for the layer.
    use_bias: If `True`, compute an affine map `y = Wx + b`; else compute
        a linear map `y = Wx`.

  Returns:
      LocallyConnectedDense base.Layer.
  """
    if n_modules == 1:
        return tl.Dense(n_units,
                        kernel_initializer=kernel_initializer,
                        bias_initializer=bias_initializer,
                        use_bias=use_bias)
    return tl.Serial(
        tl.SplitLastAxis(n_modules),
        tl.LocallyConnected1d(n_units,
                              kernel_size,
                              kernel_initializer=kernel_initializer,
                              bias_initializer=bias_initializer,
                              use_bias=use_bias,
                              padding='WRAP'), tl.MergeLastTwoAxes())
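A hedged usage sketch (assuming the function is importable from trax.layers.research.sparsity; with n_modules=4 and n_units=8 the merged output width is n_modules * n_units = 32):

import numpy as np
from trax import shapes
from trax.layers.research import sparsity

layer = sparsity.LocallyConnectedDense(n_modules=4, n_units=8, kernel_size=1)
x = np.ones((2, 32), dtype=np.float32)   # last axis is split into 4 modules of 8
layer.init(shapes.signature(x))
y = layer(x)                             # expected shape: (2, 32)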
Example no. 22
File: core.py Project: srush/trax
  def __init__(self,
               n_units,
               kernel_initializer=init.GlorotUniformInitializer(),
               bias_initializer=init.RandomNormalInitializer(1e-6),
               use_bias=True):
    """Returns a dense / fully connected layer of width `n_units`.

    Args:
      n_units: Number of nodes in the layer, also known as the "width" of the
          layer.
      kernel_initializer: Function that creates a matrix of (random) initial
          connection weights ($$W$$) for the layer.
      bias_initializer: Function that creates a vector of (random) initial
          bias weights ($$b$$) for the layer.
      use_bias: If True, compute an affine map: $$y = W x + b$$; else compute
          a linear map: $$y = W x$$.
    """
    super().__init__(name=f'Dense_{n_units}')
    self._n_units = n_units
    self._kernel_initializer = kernel_initializer
    self._bias_initializer = bias_initializer
    self._use_bias = use_bias
Example no. 23
    def __init__(self,
                 vocab_size,
                 d_feature,
                 kernel_initializer=init.RandomNormalInitializer(1.0)):
        """Returns an embedding layer with given vocabulary size and vector size.

    The layer clips input values (token ids) to the range `[0, vocab_size)`.
    That is, negative token ids all clip to `0` before being mapped to a
    vector, and token ids with value `vocab_size` or greater all clip to
    `vocab_size - 1` before being mapped to a vector.

    Args:
      vocab_size: Size of the input vocabulary. The layer will assign a unique
          vector to each id in `range(vocab_size)`.
      d_feature: Dimensionality/depth of the output vectors.
      kernel_initializer: Function that creates (random) initial vectors for
          the embedding.
    """
        # TODO(jonni): is the clipping behavior what we want going forward?
        super().__init__(name=f'Embedding_{vocab_size}_{d_feature}')
        self._d_feature = d_feature  # feature dimensionality
        self._vocab_size = vocab_size
        self._kernel_initializer = kernel_initializer
Example no. 24
def RelativeAttentionLayer(d_feature,
                           total_kv_pooling,
                           n_heads=1,
                           dropout=0.0,
                           n_raw_tokens_generated=1,
                           max_inference_length=3072,
                           chunk_len=None,
                           chunk_offset=None,
                           mode='train'):
  """Returns a layer that maps (q, k, v, masks) to (activations, masks).

  When the number of keys is smaller than the number of queries, the layer
  works in O(q^2*d); otherwise it is O(q*k*d). That is because we need to
  shift relative distances by the current pooling, and when we upsample, the
  current pooling is a fraction < 1.
  Visual explanation:
  [01][23][45][67] -> [0][1][2][3][4][5][6][7]
  For token [0] we calculate relative distances as follows:
  * 0 2 4 6
  However, for token [1] we need relative distances shifted by 1, specifically:
  * -1 1 3 5
  So we need to calculate not only the distances that correspond to the spacing
  between the keys, but also the ones in between, because there is more than
  one query token (at different positions, which means different relative
  distances) per key token.

  Args:
    d_feature: Depth/dimensionality of feature embedding.
    total_kv_pooling: Accumulated pool size of keys/values used at this layer.
    n_heads: Number of attention heads.
    dropout: Probabilistic rate for internal dropout applied to attention
      activations (based on query-key pairs) before dotting them with values.
    n_raw_tokens_generated: Number of tokens generated in a single pass through
      this layer. Used only in 'predict' non-training mode.
    max_inference_length: Maximum sequence length allowed in non-training
      modes.
    chunk_len (optional): Number of tokens per chunk. Setting this option will
      enable chunked attention.
    chunk_offset (optional): Offset for shifting chunks, for shifted chunked
      attention.
    mode: One of `'train'`, `'eval'`, or `'predict'`.
  """
  pos_emb = PositionalEmbeddings(
      d_feature,
      total_kv_pooling,
      max_inference_length=max_inference_length,
      chunk_len=chunk_len,
      chunk_offset=chunk_offset,
      n_raw_tokens_generated=n_raw_tokens_generated,
      mode=mode)

  attention = RelativeAttention(  # pylint: disable=no-value-for-parameter
      total_kv_pooling=total_kv_pooling,
      n_heads=n_heads,
      dropout=dropout,
      n_raw_tokens_generated=n_raw_tokens_generated,
      max_inference_length=max_inference_length,
      chunk_len=chunk_len,
      chunk_offset=chunk_offset,
      mode=mode)

  assert d_feature % n_heads == 0
  d_head = d_feature // n_heads
  context_bias_layer = core.Weights(
      init.RandomNormalInitializer(1e-6), shape=(1, n_heads, 1, d_head))
  location_bias_layer = core.Weights(
      init.RandomNormalInitializer(1e-6), shape=(1, n_heads, 1, d_head))

  return cb.Serial(
      cb.Branch(
          cb.Serial(pos_emb, core.Dense(d_feature)),
          core.Dense(d_feature),
          core.Dense(d_feature),
          core.Dense(d_feature),
          cb.Select([1])  # mask
      ),
      context_bias_layer,
      location_bias_layer,
      attention,
      core.Dense(d_feature),
  )
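A small sketch of the relative-distance bookkeeping described in the docstring, for the upsampling example [01][23][45][67] -> [0]...[7] (plain numpy, not the layer's actual code):

import numpy as np

queries_pos = np.arange(8)      # upsampled query tokens [0] .. [7]
keys_pos = np.arange(0, 8, 2)   # pooled key tokens [01][23][45][67]
rel_dist = keys_pos[None, :] - queries_pos[:, None]
# rel_dist[0] -> [ 0  2  4  6]
# rel_dist[1] -> [-1  1  3  5]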
Example no. 25
 def test_shape(self):
     layer = tl.Weights(init.RandomNormalInitializer(), (5, 10, 3))
     layer.init(())
     y = layer(())
     self.assertEqual(y.shape, (5, 10, 3))
Example no. 26
 def test_simple_custom_initializer(self):
     layer = tl.Weights(init.RandomNormalInitializer())
     layer.init(())
     y = layer(())
     self.assertEqual(y.shape, ())
     self.assertNotEqual(y.tolist(), 0.)
Example no. 27
 def __init__(self, initializer=init.RandomNormalInitializer(0.01)):
     super(ShiftRightLearned, self).__init__()
     self._initializer = initializer
Example no. 28
 def test_random_normal(self):
     initializer = initializers.RandomNormalInitializer()
     input_shape = (29, 5, 7, 20)
     init_value = initializer(input_shape, random.get_prng(0))
     self.assertEqual(tuple(init_value.shape), input_shape)