Example #1
    def __init__(self,
                 n_units,
                 kernel_initializer=init.GlorotUniformInitializer(),
                 bias_initializer=init.RandomNormalInitializer(1e-6),
                 use_bias=True,
                 use_bfloat16=False):
        """Returns a dense (fully connected) layer of width `n_units`.

    A dense layer maps collections of `R^m` vectors to `R^n`, where `n`
    (`= n_units`) is fixed at layer creation time, and `m` is set at layer
    initialization time.

    Args:
      n_units: Number of nodes in the layer, also known as the width of the
          layer.
      kernel_initializer: Function that creates a matrix of (random) initial
          connection weights `W` for the layer.
      bias_initializer: Function that creates a vector of (random) initial
          bias weights `b` for the layer.
      use_bias: If `True`, compute an affine map `y = Wx + b`; else compute
          a linear map `y = Wx`.
      use_bfloat16: If `True`, use bfloat16 weights instead of the default
        float32; this can save memory but may (rarely) lead to numerical issues.
    """
        super().__init__(name=f'Dense_{n_units}')
        self._n_units = n_units
        self._kernel_initializer = kernel_initializer
        self._bias_initializer = bias_initializer
        self._use_bias = use_bias
        self._use_bfloat16 = use_bfloat16
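
A minimal usage sketch for the layer above, using the public Trax API; the shapes and values are illustrative only, not taken from the source.

import numpy as np
from trax import layers as tl
from trax import shapes

# Width n = 16 is fixed at creation; m = 8 is set when the layer is
# initialized on a concrete input signature.
layer = tl.Dense(16)
x = np.ones((4, 8), dtype=np.float32)   # a batch of 4 vectors in R^8
layer.init(shapes.signature(x))         # creates W of shape (8, 16) and b of shape (16,)
y = layer(x)                            # y has shape (4, 16)
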
Example #2
def EinsumDense(d_input, d_output, use_bias):
    """Returns a reimplementation of Dense layer, using einsum.

  While this is an equivalent of a Dense layer, it seems to be faster when used
  in decoding if used with bias (see decoding_timing_test.py ).
  This layer can be removed when we understand better the reason for the
  difference in decoding speed.

  Args:
    d_input: Dimensionality of the input tensor.
    d_output: Dimensionality of the output tensor.
    use_bias: Whether to use bias.
  """
    layers = [
        tl.Weights(init.GlorotUniformInitializer(), [d_output, d_input]),
        tl.Fn(
            'EinsumDense',
            (
                lambda kernel, embeds:  # pylint: disable=g-long-lambda
                jnp.einsum('xd,...d->...x', kernel, embeds)))
    ]
    if use_bias:
        layers.extend([
            tl.Weights(init.RandomNormalInitializer(1e-6), [d_output]),
            tl.Add()
        ])
    return tl.Serial(layers)
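
As a sanity check, the einsum 'xd,...d->...x' used above is just a matrix multiply by the transposed kernel; a small standalone sketch with illustrative shapes:

import numpy as np
import jax.numpy as jnp

d_input, d_output = 8, 16
kernel = np.random.rand(d_output, d_input).astype(np.float32)  # shape [d_output, d_input]
embeds = np.random.rand(4, 10, d_input).astype(np.float32)     # shape [..., d_input]

out_einsum = jnp.einsum('xd,...d->...x', kernel, embeds)
out_matmul = embeds @ kernel.T                                  # same contraction over d
assert out_einsum.shape == (4, 10, d_output)
assert jnp.allclose(out_einsum, out_matmul, atol=1e-5)
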
Example #3
  def init_weights_and_state(self, input_signature):
    """Randomly initializes the positional encoding vectors.

    Args:
      input_signature: :py:class:`ShapeDtype` instance characterizing the input
          this layer should compute on.
    """
    d_feature = input_signature.shape[-1]
    if self._d_feature is not None:
      d_feature = self._d_feature
    pe = np.zeros((self._max_len, d_feature), dtype=np.float32)
    position = np.arange(0, self._max_len)[:, np.newaxis]
    div_term = np.exp(
        np.arange(0, d_feature, 2) * -(np.log(10000.0) / d_feature))
    pe[:, 0::2] = np.sin(position * div_term)
    pe[:, 1::2] = np.cos(position * div_term)  # [self._max_len, d_feature]
    if self._use_bfloat16:
      pe = pe.astype(jnp.bfloat16)
    w = jnp.array(pe)  # Trainable parameters, initialized above.
    if self._d_feature is not None:
      ff = init.GlorotUniformInitializer()(
          (d_feature, input_signature.shape[-1]), self.rng)
      self.weights = w, ff
    else:
      self.weights = w
    if self._mode == 'predict':
      self.state = jnp.zeros((), dtype=jnp.int32)
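
The table built above is the standard sinusoidal positional encoding; a standalone numpy sketch of the same computation, with small illustrative sizes (d_feature must be even here):

import numpy as np

max_len, d_feature = 6, 4
pe = np.zeros((max_len, d_feature), dtype=np.float32)
position = np.arange(0, max_len)[:, np.newaxis]        # shape [max_len, 1]
div_term = np.exp(np.arange(0, d_feature, 2) * -(np.log(10000.0) / d_feature))
pe[:, 0::2] = np.sin(position * div_term)              # even feature indices: sines
pe[:, 1::2] = np.cos(position * div_term)              # odd feature indices: cosines
print(pe.shape)                                        # (6, 4)
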
Example #4
 def __init__(self,
              d_ff,
              n_elements_in_block=32,
              d_lowrank=64,
              temperature=0.1,
              quant_prob=0.3,
              use_bfloat16=False,
              big_weights_in_bfloat16=True,
              mode='train',
              kernel_initializer=init.GlorotUniformInitializer(),
              bias_initializer=init.RandomNormalInitializer(1e-6)):
     """Returns a sparse feed-forward block."""
     super().__init__(name=f'SparseFF_{d_ff}')
     self._mode = mode
     self._use_bfloat16 = use_bfloat16
     self._big_weights_in_bfloat16 = big_weights_in_bfloat16
     self._d_ff = d_ff
     self._d_lowrank = d_lowrank
     # Q: what temperature is actually most useful in training?
     self._temperature = temperature if mode == 'train' else 0.0
     self._quant_prob = quant_prob
     self._n_elements_in_block = n_elements_in_block
     self._kernel_initializer = kernel_initializer
     self._bias_initializer = bias_initializer
      # Helper dimensions: d_ff is split into d1 blocks of d2 = n_elements_in_block elements.
     assert self._d_ff % self._n_elements_in_block == 0
     self._d1 = self._d_ff // self._n_elements_in_block
     self._d2 = self._n_elements_in_block
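
The helper numbers at the end simply factor d_ff into blocks; a quick check with illustrative values (d_ff = 2048 is an assumption, not a default from the source):

d_ff, n_elements_in_block = 2048, 32
assert d_ff % n_elements_in_block == 0
d1 = d_ff // n_elements_in_block   # number of blocks: 64
d2 = n_elements_in_block           # elements per block: 32
print(d1 * d2 == d_ff)             # True
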
Example #5
    def __init__(self,
                 filters,
                 kernel_size,
                 kernel_initializer=init.GlorotUniformInitializer(),
                 bias_initializer=init.RandomNormalInitializer(1e-6),
                 use_bias=True,
                 padding='VALID'):
        """Returns a locally-connected conv-like layer.

    Args:
      filters: Number of output filters in the convolution.
      kernel_size: A length of the convolution window. Must be an odd number.
      kernel_initializer: Function that creates a matrix of (random) initial
          connection weights `W` for the layer.
      bias_initializer: Function that creates a vector of (random) initial
          bias weights `b` for the layer.
      use_bias: If `True`, the layer uses a bias vector.
      padding: The type of padding to use; must be 'VALID', 'SAME', or 'WRAP'.
    """
        super().__init__(name=f'LocallyConnected1d_{filters}_{kernel_size}')
        self._filters = filters
        self._kernel_size = kernel_size
        assert self._kernel_size % 2 == 1  # kernel size has to be odd
        self._kernel_initializer = kernel_initializer
        self._bias_initializer = bias_initializer
        self._use_bias = use_bias
        self._padding = padding
Example #6
 def __init__(self,
              d_feature,
              vocab_size,
              kernel_initializer=init.GlorotUniformInitializer()):
     super(Embedding, self).__init__()
     self._d_feature = d_feature  # feature dimensionality
     self._vocab_size = vocab_size
     self._kernel_initializer = kernel_initializer
Example #7
 def __init__(self,
              n_heads=1,
              d_model=1024,
              kernel_initializer=init.GlorotUniformInitializer()):
     super(ComputeAttentionOutput, self).__init__()
     self._n_heads = n_heads
     self._d_model = d_model
     self._kernel_initializer = kernel_initializer
Example #8
 def __init__(self,
              n_heads=1,
              d_head=64,
              kernel_initializer=init.GlorotUniformInitializer()):
     super(ComputeAttentionHeads, self).__init__()
     self._n_heads = n_heads
     self._d_head = d_head
     self._kernel_initializer = kernel_initializer
Example #9
 def __init__(self,
              n_units,
              kernel_initializer=init.GlorotUniformInitializer(),
              bias_initializer=init.RandomNormalInitializer(1e-6)):
     super(Dense, self).__init__()
     self._n_units = n_units
     self._kernel_initializer = kernel_initializer
     self._bias_initializer = bias_initializer
Example #10
 def __init__(self,
              kernel_size=3,
              kernel_initializer=init.GlorotUniformInitializer(),
              use_bfloat16=False):
     """Returns a causal depthwise convolution layer."""
     super().__init__(n_in=1, n_out=1)
     self._kernel_size = kernel_size
     self._kernel_initializer = kernel_initializer
     self._use_bfloat16 = use_bfloat16
Example #11
 def __init__(self,
              n_units,
              forget_bias=1.0,
              kernel_initializer=initializers.GlorotUniformInitializer(),
              bias_initializer=initializers.RandomNormalInitializer(1e-6)):
     super(LSTMCell, self).__init__(n_in=2, n_out=2)
     self._n_units = n_units
     self._forget_bias = forget_bias
     self._kernel_initializer = kernel_initializer
     self._bias_initializer = bias_initializer
Example #12
def MultiplicativeModularSparseDense(sparsity, d_feature):
    """Returns a replacement of Dense layer which uses less parameters.

  The layer uses number of modules equal to `sparsity`. It is a combination of
  multiplicative dense and locally connected dense layers.

  Args:
    sparsity: The sparsity of the layer; the output vector is divided into this
        number of modules.
    d_feature: Dimensionality of input and output tensor.
  """

    assert d_feature % sparsity == 0
    d_module = d_feature // sparsity

    return tl.Serial(
        # Weight below is used for per-head preprocessing of an embedding.
        tl.Weights(init.RandomNormalInitializer(stddev=0.5),
                   shape=[sparsity, d_feature]),
        # Weight below is a kernel of multiplicative dense, shared across heads.
        tl.Weights(init.GlorotUniformInitializer(), [d_feature, d_module]),
        # Weight below is a kernel of modular dense.
        tl.Weights(
            functools.partial(init.GlorotUniformInitializer(),
                              nonreceptive_dims=[0]),
            [sparsity, d_module, d_module]),
        # To save memory the per-head preprocessing and multiplying by
        # kernels is done in a single einsum.
        tl.Fn(
            'SparseDenseEinsum',
            (
                lambda kmod, kmult, multiplier, embeds:  # pylint: disable=g-long-lambda
                jnp.einsum('hxo,dx,hd,...d->...ho', kmod, kmult, multiplier,
                           embeds))),
        MergeLastTwoAxes(),
        # Weight below is bias after dense, per-head.
        tl.Weights(init.RandomNormalInitializer(1e-6), [d_feature]),
        tl.Add(),
    )
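
The single einsum above contracts the per-head multiplier, the shared multiplicative kernel, and the modular kernel in one step; a shape-level sketch with plain arrays (sparsity = 4 and d_feature = 8 are illustrative):

import numpy as np
import jax.numpy as jnp

sparsity, d_feature = 4, 8
d_module = d_feature // sparsity                                  # 2

kmod = np.ones((sparsity, d_module, d_module), np.float32)        # 'hxo': modular kernel
kmult = np.ones((d_feature, d_module), np.float32)                # 'dx': multiplicative kernel
multiplier = np.ones((sparsity, d_feature), np.float32)           # 'hd': per-head preprocessing
embeds = np.ones((3, 5, d_feature), np.float32)                   # '...d': inputs

out = jnp.einsum('hxo,dx,hd,...d->...ho', kmod, kmult, multiplier, embeds)
print(out.shape)  # (3, 5, 4, 2); MergeLastTwoAxes then yields (3, 5, 8)
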
Example #13
def MultiplicativeSparseDense(sparsity,
                              d_input,
                              d_output=None,
                              use_bias=True,
                              use_bfloat16=False):
    """Returns a replacement of Dense layer which uses less parameters.

  The layer uses number of modules equal to `sparsity`. It multiplies each
  dimension of the input tensor by a scalar specific to each dimension and each
  module separately; then it applies Dense(d_output/sparsity) to each module.
  Compared to standard dense layer, MultiplicativeSparseDense uses less
  parameters while still being able to express many interesting functions (for
  example a permutation).

  Args:
    sparsity: The sparsity of the layer; the output vector is divided into this
        number of modules.
    d_input: Dimensionality of input tensor.
    d_output: Dimensionality of output tensor; by default equal to d_input.
    use_bias: Whether to use bias.
    use_bfloat16: Whether to use bfloat16 for weights.
  """

    # Default the output dimensionality to the input's, as documented above.
    if d_output is None:
        d_output = d_input
    assert d_output % sparsity == 0
    d_module = d_output // sparsity

    layers = [
        # Weight below is used for per-head preprocessing of an embedding.
        tl.Weights(init.RandomNormalInitializer(stddev=0.5),
                   shape=[sparsity, d_input],
                   use_bfloat16=use_bfloat16),
        # Weight below is dense kernel, shared across heads.
        tl.Weights(init.GlorotUniformInitializer(), [d_input, d_module],
                   use_bfloat16=use_bfloat16),
        # To save memory the per-head preprocessing and multiplying by the
        # kernel is done in the same einsum.
        tl.Fn(
            'AttentionEinsum',
            (
                lambda kernel, multiplier, embeds:  # pylint: disable=g-long-lambda
                jnp.einsum('dx,hd,...d->...hx', kernel, multiplier, embeds))),
        MergeLastTwoAxes(),
    ]
    if use_bias:
        layers.extend([
            # Weight below is bias after dense, per-head.
            tl.Weights(init.RandomNormalInitializer(1e-6), [d_output],
                       use_bfloat16=use_bfloat16),
            tl.Add(),
        ])
    return tl.Serial(layers)
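
A rough parameter count makes the "fewer parameters" claim concrete; sparsity = 16 and d_input = d_output = 1024 are illustrative assumptions, not values from the source:

# Standard Dense: d_input * d_output kernel entries plus d_output biases.
# MultiplicativeSparseDense: a per-head multiplier (sparsity * d_input), a
# shared kernel (d_input * d_module), and optionally d_output biases.
sparsity, d_input = 16, 1024
d_output = d_input
d_module = d_output // sparsity                                       # 64

dense_params = d_input * d_output + d_output                          # 1,049,600
sparse_params = sparsity * d_input + d_input * d_module + d_output    # 82,944
print(dense_params, sparse_params)
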
Example #14
 def __init__(self,
              d_ff,
              num_experts=64,
              temperature=0.7,
              mode='train',
              kernel_initializer=init.GlorotUniformInitializer(),
              bias_initializer=init.RandomNormalInitializer(1e-6)):
     """Returns a block sparse feed-forward block."""
     super().__init__(name=f'BlockSparseFF_{d_ff}')
     self._mode = mode
     self._d_ff = d_ff
     self._num_experts = num_experts
     self._temperature = temperature if mode == 'train' else 0.0
     self._n_elements_in_block = d_ff // num_experts
     self._kernel_initializer = kernel_initializer
     self._bias_initializer = bias_initializer
     assert self._d_ff % self._num_experts == 0
Example #15
    def __init__(self,
                 n_units,
                 kernel_initializer=init.GlorotUniformInitializer(),
                 bias_initializer=init.RandomNormalInitializer(1e-6)):
        """Returns a dense / fully connected layer of width `n_units`.

    Args:
      n_units: Number of nodes in the layer, also known as the "width" of the
          layer.
      kernel_initializer: Function that creates a matrix of (random) initial
          connection weights ($$W$$) for the layer.
      bias_initializer: Function that creates a vector of (random) initial
          bias weights ($$b$$) for the layer.
    """
        super().__init__()
        self._n_units = n_units
        self._kernel_initializer = kernel_initializer
        self._bias_initializer = bias_initializer
Example #16
def LocallyConnectedDense(
        n_modules,
        n_units,
        kernel_size=1,  # pylint: disable=invalid-name
        kernel_initializer=init.GlorotUniformInitializer(),
        bias_initializer=init.RandomNormalInitializer(1e-6),
        use_bias=True):
    """Layer using LocallyConnected1d for approximation of Dense layer.

  The layer splits the last axis of a tensor into `n_modules`, then runs
  LocallyConnected1d (grouped convolution) on all those modules, and
  concatenates their results. It is essentially a locally-sensitive
  approximation of Dense layer, with number of parameters smaller by the factor
  of `n_modules / kernel_size`.

  Args:
    n_modules: Indicates how many modules (pixels) should be input and output
        split into for processing.
    n_units: how many outputs (filters) should each module generate.
    kernel_size: The size of the kernel to be used.
    kernel_initializer: Function that creates a matrix of (random) initial
        connection weights `W` for the layer.
    bias_initializer: Function that creates a vector of (random) initial
        bias weights `b` for the layer.
    use_bias: If `True`, compute an affine map `y = Wx + b`; else compute
        a linear map `y = Wx`.

  Returns:
      LocallyConnectedDense base.Layer.
  """
    if n_modules == 1:
        return tl.Dense(n_units,
                        kernel_initializer=kernel_initializer,
                        bias_initializer=bias_initializer,
                        use_bias=use_bias)
    return tl.Serial(
        tl.SplitLastAxis(n_modules),
        tl.LocallyConnected1d(n_units,
                              kernel_size,
                              kernel_initializer=kernel_initializer,
                              bias_initializer=bias_initializer,
                              use_bias=use_bias,
                              padding='WRAP'),
        tl.MergeLastTwoAxes())
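
A quick arithmetic check of the `n_modules / kernel_size` reduction factor mentioned in the docstring; the numbers are illustrative:

# Per the docstring above, LocallyConnectedDense uses roughly
# n_modules / kernel_size times fewer parameters than a Dense layer
# mapping the same d -> d feature space.
d, n_modules, kernel_size = 1024, 16, 1
dense_kernel = d * d                                     # ~1.05M parameters
local_kernel = dense_kernel * kernel_size // n_modules   # ~65K parameters
print(dense_kernel // local_kernel)                      # 16 == n_modules // kernel_size
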
Example #17
File: core.py Project: srush/trax
  def __init__(self,
               n_units,
               kernel_initializer=init.GlorotUniformInitializer(),
               bias_initializer=init.RandomNormalInitializer(1e-6),
               use_bias=True):
    """Returns a dense / fully connected layer of width `n_units`.

    Args:
      n_units: Number of nodes in the layer, also known as the "width" of the
          layer.
      kernel_initializer: Function that creates a matrix of (random) initial
          connection weights ($$W$$) for the layer.
      bias_initializer: Function that creates a vector of (random) initial
          bias weights ($$b$$) for the layer.
      use_bias: If True, compute an affine map: $$y = W x + b$$; else compute
          a linear map: $$y = W x$$.
    """
    super().__init__(name=f'Dense_{n_units}')
    self._n_units = n_units
    self._kernel_initializer = kernel_initializer
    self._bias_initializer = bias_initializer
    self._use_bias = use_bias
Example #18
 def test_glorot_uniform(self):
     initializer = initializers.GlorotUniformInitializer()
     input_shape = (29, 5, 7, 20)
     init_value = initializer(input_shape, random.get_prng(0))
     self.assertEqual(tuple(init_value.shape), input_shape)
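
For context, Glorot (Xavier) uniform initialization samples from U(-limit, limit) with limit = sqrt(6 / (fan_in + fan_out)); below is a hedged standalone sketch, not the Trax implementation (which may choose fan dimensions differently for 4-D shapes):

import numpy as np

def glorot_uniform(shape, seed=0):
    """Glorot/Xavier uniform: U(-limit, limit), limit = sqrt(6 / (fan_in + fan_out))."""
    fan_in, fan_out = shape[-2], shape[-1]   # simple convention; frameworks differ for conv shapes
    limit = np.sqrt(6.0 / (fan_in + fan_out))
    return np.random.default_rng(seed).uniform(-limit, limit, size=shape).astype(np.float32)

w = glorot_uniform((29, 5, 7, 20))
print(w.shape, float(np.abs(w).max()) <= np.sqrt(6.0 / (7 + 20)))  # (29, 5, 7, 20) True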