Ejemplo n.º 1
0
 def call(self, inputs, params=(), rng=None, **kwargs):
   """Runs dot-product attention under a lower-triangular boolean mask.

   Args:
     inputs: A sequence that unpacks into exactly three arrays, used in order
       as the queries, keys and values.
     params: Ignored; dropped immediately.
     rng: Forwarded unchanged to `tl.DotProductAttention`.
     **kwargs: Accepted but not used by this method.

   Returns:
     Whatever `tl.DotProductAttention` returns for the masked inputs.
   """
   del params
   queries, keys, values = inputs
   # The mask is square, sized by the second-to-last axis of the queries
   # (presumably the sequence length -- confirm against callers).
   side = queries.shape[-2]
   # k=0 keeps the main diagonal and everything below it, so entry (i, j) is
   # True only when j <= i.
   all_true = np.ones((1, side, side), dtype=onp.bool_)
   lower_triangle = np.tril(all_true, k=0)
   return tl.DotProductAttention(
       queries, keys, values, lower_triangle,
       dropout=self._dropout, mode=self._mode, rng=rng)
Ejemplo n.º 2
0
 def call(self, inputs, params=(), state=(), rng=None, **kwargs):
   """Runs dot-product attention under a lower-triangular boolean mask.

   Args:
     inputs: A sequence that unpacks into exactly three arrays, used in order
       as the queries, keys and values.
     params: Ignored; dropped immediately.
     state: Returned to the caller untouched.
     rng: Forwarded unchanged to `DotProductAttention`.
     **kwargs: Accepted but not used by this method.

   Returns:
     A `(result, state)` pair, where `result` is the value returned by
     `DotProductAttention` and `state` is the argument passed in.
   """
   del params
   queries, keys, values = inputs
   # The mask is square, sized by the second-to-last axis of the queries
   # (presumably the sequence length -- confirm against callers).
   side = queries.shape[-2]
   # Not all backends define np.tril, so the backend module is only used when
   # running on jax. Falling back to onp.tril is inefficient in that it creates
   # a large global constant. TODO(kitaev): try to find an alternative that
   # works across all backends.
   array_lib = np if backend.get_name() == 'jax' else onp
   # k=0 keeps the main diagonal and everything below it.
   lower_triangle = array_lib.tril(
       array_lib.ones((1, side, side), dtype=onp.bool_), k=0)
   attended = DotProductAttention(
       queries, keys, values, lower_triangle,
       dropout=self._dropout, mode=self._mode, rng=rng)
   return attended, state
Ejemplo n.º 3
0
  def forward(self, inputs, params=(), state=(), rng=None, **kwargs):
    """Runs masked dot-product attention, with a separate 'predict' path.

    In 'train' and 'eval' modes a lower-triangular boolean mask is built from
    the query shape and `state` is passed through untouched. In 'predict' mode
    the state is refreshed via `_fast_inference_update_state`, and the keys,
    values and mask are taken from the refreshed state instead.

    Args:
      inputs: A sequence that unpacks into exactly three arrays, used in order
        as the queries, keys and values.
      params: Ignored; dropped immediately.
      state: Layer state; only consulted and replaced in 'predict' mode.
      rng: Forwarded unchanged to `DotProductAttention`.
      **kwargs: Accepted but not used by this method.

    Returns:
      A `(result, state)` pair, where `result` is the value returned by
      `DotProductAttention` and `state` is either the original argument
      ('train'/'eval') or the refreshed state ('predict').
    """
    del params
    queries, keys, values = inputs
    if self._mode not in ('train', 'eval'):
      # The only other supported mode is 'predict'.
      assert self._mode == 'predict'
      state = _fast_inference_update_state(inputs, state)
      # The refreshed state is a 4-tuple whose first three entries override
      # the keys and values from `inputs` and provide the mask; the fourth
      # entry is not needed here.
      keys, values, mask, _ = state
    else:
      # The mask is square, sized by the second-to-last axis of the queries
      # (presumably the sequence length -- confirm against callers).
      side = queries.shape[-2]
      # Not all backends define np.tril, so the backend module is only used
      # when running on jax. Falling back to onp.tril is inefficient in that it
      # creates a large global constant. TODO(kitaev): try to find an
      # alternative that works across all backends.
      array_lib = np if backend.get_name() == 'jax' else onp
      # k=0 keeps the main diagonal and everything below it.
      mask = array_lib.tril(
          array_lib.ones((1, side, side), dtype=onp.bool_), k=0)

    attended = DotProductAttention(
        queries, keys, values, mask,
        dropout=self._dropout, mode=self._mode, rng=rng)
    return attended, state