def call(self, inputs, params=(), rng=None, **kwargs):
  """Apply causal (autoregressive) dot-product attention.

  Args:
    inputs: tuple of (queries, keys, values); queries have shape
      [..., seq_len, d_feature] (seq_len is read from axis -2).
    params: unused; deleted immediately.
    rng: random key forwarded to attention dropout.
    **kwargs: ignored.

  Returns:
    The attention output tensor from `tl.DotProductAttention`.
  """
  del params
  q, k, v = inputs
  mask_size = q.shape[-2]
  # Lower-triangular mask: each position may attend only to itself and
  # earlier positions. Not all backends define np.tril (see the sibling
  # versions of this method in this file), so fall back to plain numpy
  # when not running on the jax backend.
  if backend.get_name() == 'jax':
    mask = np.tril(np.ones((1, mask_size, mask_size), dtype=onp.bool_), k=0)
  else:
    mask = onp.tril(onp.ones((1, mask_size, mask_size), dtype=onp.bool_),
                    k=0)
  res = tl.DotProductAttention(
      q, k, v, mask, dropout=self._dropout, mode=self._mode, rng=rng)
  return res
def call(self, inputs, params=(), state=(), rng=None, **kwargs):
  """Compute causal attention over a (queries, keys, values) triple.

  A lower-triangular mask restricts every query position to attend only
  to key positions at or before it. Returns (output, state) with the
  state passed through unchanged.
  """
  del params
  queries, keys, values = inputs
  seq_len = queries.shape[-2]
  # Not all backends define np.tril. However, using onp.tril is inefficient
  # in that it creates a large global constant. TODO(kitaev): try to find an
  # alternative that works across all backends.
  mod = np if backend.get_name() == 'jax' else onp
  causal_mask = mod.tril(
      mod.ones((1, seq_len, seq_len), dtype=onp.bool_), k=0)
  result = DotProductAttention(
      queries, keys, values, causal_mask,
      dropout=self._dropout, mode=self._mode, rng=rng)
  return result, state
def forward(self, inputs, params=(), state=(), rng=None, **kwargs):
  """Run causal attention; in 'predict' mode use fast-inference caching.

  In 'train'/'eval' mode a fresh lower-triangular mask is built for the
  full sequence. In 'predict' mode the cached keys/values/mask are pulled
  from `state` after folding in the new inputs.
  Returns (output, state).
  """
  del params
  queries, keys, values = inputs
  if self._mode in ('train', 'eval'):
    seq_len = queries.shape[-2]
    # Not all backends define np.tril. However, using onp.tril is inefficient
    # in that it creates a large global constant. TODO(kitaev): try to find an
    # alternative that works across all backends.
    mod = np if backend.get_name() == 'jax' else onp
    attn_mask = mod.tril(
        mod.ones((1, seq_len, seq_len), dtype=onp.bool_), k=0)
  else:
    assert self._mode == 'predict'
    # Incremental decoding: fold the new (k, v) into the cached state and
    # read back the accumulated keys, values and matching mask.
    state = _fast_inference_update_state(inputs, state)
    keys, values, attn_mask, _ = state
  out = DotProductAttention(
      queries, keys, values, attn_mask,
      dropout=self._dropout, mode=self._mode, rng=rng)
  return out, state