Example no. 1
    def prepare(self, query, key=None, value=None, mask=None):
        r""" Preparing input for attention model

    Returns:
      query: Query (or target sequence) tensor of shape `[batch_size, Tq, dim]`.
      key: Key (or source sequence) tensor of shape `[batch_size, Tv, dim]`.
      value: Value (or source sequence) tensor of shape `[batch_size, Tv, dim]`.
      mask: list of the following
        * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`.
            If given, the output will be zero at the positions where
            `mask==False`.
        * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
            If given, will apply the mask such that values at positions where
            `mask==False` do not contribute to the result.
    """
        # by default, if the key is not provided, the value is used instead (see below)
        query = bk.array(query, ignore_none=True)
        key = bk.array(key, ignore_none=True)
        value = bk.array(value, ignore_none=True)
        # ====== check if intra-attention ====== #
        if self.is_self_attention:
            if (key is not None or value is not None):
                warnings.warn(
                    "Self-attention (intra-attention) need only query, "
                    "ignore provided key and value",
                    category=UserWarning)
            if key is not None:
                key = query
            if value is not None:
                value = query
        ### inter-attention
        else:
            if key is None:
                key = value
            if value is None:  # the value must always be provided
                raise RuntimeError(
                    "value must be given for inter-sequence attention.")
        # ====== masks ====== #
        if self.is_self_attention:  # only one mask is needed
            if isinstance(mask, (tuple, list)):
                q_mask = mask[0]
            else:
                q_mask = mask
            v_mask = None
        else:
            q_mask = mask[0] if mask else None
            v_mask = mask[1] if mask else None
            if v_mask is not None:
                if v_mask.shape[1] != value.shape[1]:
                    raise RuntimeError(
                        "Value mask has time dimension %d, but value has time dimension %d"
                        % (v_mask.shape[1], value.shape[1]))
        # ====== return ====== #
        return (query, key, value, bk.array(q_mask, ignore_none=True),
                bk.array(v_mask, ignore_none=True))
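The boolean masks documented above are ordinary padding masks. As a standalone NumPy illustration (not part of the `bk` backend), such a `[batch_size, Tv]` mask is typically built from the sequence lengths:

import numpy as np

# hypothetical lengths of three zero-padded sequences, padded to Tv = 5
lengths = np.array([3, 5, 2])
Tv = 5
# value_mask[i, t] is True for real positions and False for padding
value_mask = np.arange(Tv)[None, :] < lengths[:, None]   # shape [batch_size, Tv], boolean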
Example no. 2
 def test_matmul(self):
   for shape1, shape2, outshape in [
       [(2, 3), (4, 3, 5), (4, 2, 5)],
       [(2, 3, 4), (4, 5), (2, 3, 5)],
       [(5, 3, 4), (5, 4, 6), (5, 3, 6)],
   ]:
     x = np.random.rand(*shape1)
     y = np.random.rand(*shape2)
     for fw in FRAMEWORKS:
       a = bk.array(x, fw)
       b = bk.array(y, fw)
       c = bk.matmul(a, b)
       self.assertEqual(c.shape, outshape, msg=fw)
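The expected output shapes in this test follow the usual batched matrix-multiplication broadcasting rules, where a rank-2 operand is broadcast against a rank-3 operand. A quick NumPy check of the same shape rules (assuming `bk.matmul` mirrors `np.matmul` semantics, as the test implies):

import numpy as np

a = np.random.rand(2, 3)                  # rank-2 operand
b = np.random.rand(4, 3, 5)               # rank-3 operand, batch of 4 matrices
print(np.matmul(a, b).shape)              # (4, 2, 5): `a` is broadcast across the batch axis
print(np.matmul(np.random.rand(2, 3, 4), np.random.rand(4, 5)).shape)      # (2, 3, 5)
print(np.matmul(np.random.rand(5, 3, 4), np.random.rand(5, 4, 6)).shape)   # (5, 3, 6)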
Example no. 3
 def test_countnonzero(self):
   x = np.random.randint(0, 10, size=(25, 12, 8))
   for axis in (None, 0, 1, 2, (1, 2)):
     for keepdims in (True, False):
       for dtype in ('int32', 'float32'):
         y = [
             bk.count_nonzero(bk.array(x, fw),
                              axis=axis,
                              keepdims=keepdims,
                              dtype=dtype) for fw in FRAMEWORKS
         ]
         assert_equal(self, (axis, keepdims, dtype), *y)
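For reference, the NumPy equivalent of the operation compared across frameworks here, a hedged sketch assuming `bk.count_nonzero` mirrors `np.count_nonzero` plus a dtype cast:

import numpy as np

x = np.random.randint(0, 10, size=(25, 12, 8))
ref = np.count_nonzero(x, axis=(1, 2), keepdims=True).astype('float32')
print(ref.shape)   # (25, 1, 1)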
Example no. 4
    def __init__(self,
                 output_dim,
                 max_len=10000,
                 trainable=False,
                 mask_zero=False):
        super().__init__()
        self.output_dim = output_dim
        self.mask_zero = bool(mask_zero)
        self.trainable = bool(trainable)
        self.supports_masking = mask_zero
        self.max_len = max_len

        # Apply sine to the even columns and cosine to the odd columns.
        # If zero-masked, don't use the 0 position.
        # (i - i % 2) creates the sequence (0, 0, 2, 2, 4, 4, ...), so each
        # sin/cos pair at positions (2i, 2i + 1) shares the same denominator
        # 10000^(2i / output_dim).
        position_encoding = np.array([
            [pos / np.power(10000, (i - i % 2) / output_dim)
             for i in range(output_dim)]
            if pos != 0 or not mask_zero else [0.] * output_dim
            for pos in range(max_len)
        ])
        # [max_len, output_dim]
        position_encoding[:, 0::2] = np.sin(position_encoding[:, 0::2])  # dim 2i
        position_encoding[:, 1::2] = np.cos(position_encoding[:, 1::2])  # dim 2i+1
        if not trainable:
            self.position_encoding = bk.array(position_encoding,
                                              dtype='float32',
                                              framework=self)
        else:
            self.position_encoding = bk.variable(
                initial_value=position_encoding,
                dtype='float32',
                trainable=True,
                framework=self)
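The loop above builds the standard sinusoidal table PE(pos, 2i) = sin(pos / 10000^(2i/dim)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/dim)). A vectorized NumPy sketch of the same table (illustration only; it omits the `mask_zero` special case that zeroes position 0):

import numpy as np

def sinusoid_table(max_len, output_dim):
    pos = np.arange(max_len)[:, None]                        # [max_len, 1]
    i = np.arange(output_dim)[None, :]                       # [1, output_dim]
    angles = pos / np.power(10000, (i - i % 2) / output_dim)
    table = np.zeros((max_len, output_dim))
    table[:, 0::2] = np.sin(angles[:, 0::2])                 # even columns: sin
    table[:, 1::2] = np.cos(angles[:, 1::2])                 # odd columns: cos
    return table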
Example no. 5
 def compute_mask(self, mask=None):
     if mask is not None:
         q_mask = mask[0] if isinstance(mask, (tuple, list)) else mask
         return bk.array(q_mask)
Example no. 6
    def align(self,
              scores,
              value,
              query=None,
              q_mask=None,
              v_mask=None,
              causal=False,
              residual=False,
              dropout=0,
              temporal_dropout=False,
              sample_shape=1,
              temperature=0.5,
              training=None):
        r"""Applies attention scores to the given value tensor.

    Arguments:
      scores: Attention Scores float tensor of shape
        `[num_heads, batch_size, Tq, Tv]`.
      value: Value (or source sequence) tensor of shape
        `[num_heads, batch_size, Tv, dim]`.
      query: Query (or target sequence) tensor of shape
        `[num_heads, batch_size, Tq, dim]`.
      q_mask: A boolean query mask `Tensor` of shape `[batch_size, Tq]`.
        If given, the output will be zero at the positions where
        `mask==False`.
      v_mask: A boolean value mask `Tensor` of shape `[batch_size, Tv]`.
        If given, will apply the mask such that values at positions where
        `mask==False` do not contribute to the result.
      dropout: Float. Dropout probability of the attention scores.
      temporal_dropout: Boolean. If `True`, use the same dropout mask along the
        temporal axis (i.e. the first dimension).
      sample_shape: Integer. Number of MCMC samples used to estimate the
        gradient of hard attention.
      temperature: A 0-D `Tensor`, representing the temperature of a set of
        RelaxedOneHotCategorical distributions. The temperature should be
        positive.

    Returns:
      attended sequence: Tensor of shape
        * `[sample_shape, num_heads, batch_size, Tq, dim]` for (hard + multi-head)
        * `[sample_shape, batch_size, Tq, dim]` for (hard + no-head)
        * `[num_heads, batch_size, Tq, dim]` for (soft + multi-head)
        * `[batch_size, Tq, dim]` for (soft + no-head)
      attention distribution: for soft attention, a Tensor of shape
        * `[num_heads, batch_size, Tq]` for self-attention
        * `[num_heads, batch_size, Tq, Tv]` for inter-attention;
        for hard attention, a one-hot categorical distribution of shape
        * `[sample_shape, num_heads, batch_size, Tq]` for self-attention
        * `[sample_shape, num_heads, batch_size, Tq, Tv]` for inter-attention.
        If multi-head attention isn't used, the `[num_heads]` dimension is omitted.
    """
        num_heads = _get_num_heads(scores)
        if num_heads == 0:
            Tq = scores.shape[1]
            Tv = scores.shape[2]
        else:
            Tq = scores.shape[2]
            Tv = scores.shape[3]
        if value is None:
            if query is None:
                raise ValueError("both query and value are None, "
                                 "at least one of them must be given")
            value = query
        # ====== Causal mask ====== #
        if causal:
            # Creates a lower triangular mask, so position i cannot attend to
            # positions j>i. This prevents the flow of information from the future
            # into the past.
            scores_shape = scores.shape
            # causal_mask_shape = [1, Tq, Tv].
            causal_mask_shape = bk.concatenate(
                [bk.ones_like(scores_shape[:-2]), scores_shape[-2:]], axis=0)
            causal_mask = bk.tril_mask(causal_mask_shape)
        else:
            causal_mask = None
        if v_mask is not None:
            # LocalM applied
            if PosLocalM in self:
                v_mask = v_mask[:, -Tv:]
            # Mask of shape [batch_size, 1, Tv].
            v_mask = bk.expand_dims(v_mask, axis=-2)
            v_mask = bk.cast(v_mask, 'bool')
            if num_heads > 0:
                v_mask = bk.expand_dims(v_mask, axis=0)
        scores_mask = bk.logical_and(v_mask, causal_mask)
        ### applying the scores mask
        if scores_mask is not None:
            padding_mask = bk.logical_not(scores_mask)
            # Bias so padding positions do not contribute to attention distribution.
            scores -= 1.e9 * bk.cast(padding_mask, dtype=scores.dtype)
        # ====== convert attention score to distribution ====== #
        # if the last dimension is 1, there is no point applying softmax over it;
        # apply softmax over the second-to-last dimension instead
        ### soft attention
        if AlignSoft in self:
            attention_distribution = bk.softmax(
                scores, axis=-2 if scores.shape[-1] == 1 else -1)
        ### relaxed hard attention
        elif AlignRelax in self:
            attention_distribution = bay.distributions.RelaxedOneHotCategorical(
                temperature=temperature,
                logits=bk.squeeze(scores, axis=-1)
                if scores.shape[-1] == 1 else scores)
            fsample = partial(bay.Distribution.sample,
                              sample_shape=sample_shape)
            attention_distribution = bay.coercible_tensor(
                attention_distribution, convert_to_tensor_fn=fsample)
        ### hard attention
        elif AlignHard in self:
            attention_distribution = bay.distributions.OneHotCategorical(
                logits=bk.squeeze(scores, axis=-1)
                if scores.shape[-1] == 1 else scores,
                dtype=value.dtype)
            fsample = partial(bay.Distribution.sample,
                              sample_shape=sample_shape)
            attention_distribution = bay.coercible_tensor(
                attention_distribution, convert_to_tensor_fn=fsample)
        # ======  dropout the attention scores ====== #
        attention = bk.dropout(attention_distribution,
                               p_drop=dropout,
                               axis=1 if temporal_dropout else None,
                               training=training and dropout > 0)
        # ====== applying the attention ====== #
        if self.is_self_attention and ScoreLocation in self:
            result = (bk.expand_dims(bk.array(attention), axis=-1) * value
                      if attention.shape[-1] != 1 else attention * value)
        else:
            if PosLocalM in self:
                value = value[:, -Tv:] if num_heads == 0 else value[:, :, -Tv:]
            result = bk.matmul(attention, value)
        # ====== applying the Query mask ====== #
        if q_mask is not None:
            assert q_mask.shape[1] == Tq,\
              "Query mask has time dimension %d, but query has time dimension %d" \
                % (q_mask.shape[1], Tq)
            # Mask of shape [batch_size, Tq, 1].
            q_mask = bk.expand_dims(q_mask, axis=-1)
            result *= bk.cast(q_mask, dtype=result.dtype)
        # ====== residual connection ====== #
        if residual:
            if query is None:
                raise ValueError("query must be given for residual connection")
            result += query
        # ====== return ====== #
        return result, attention_distribution
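Two steps in `align` are worth spelling out in isolation: the additive `-1e9` bias that removes padded positions from the softmax, and the lower-triangular causal mask. A minimal NumPy sketch of both (illustration only, independent of the `bk` backend):

import numpy as np

def masked_softmax(scores, mask):
    # scores: [batch, Tq, Tv]; mask: boolean [batch or 1, Tq or 1, Tv], True = keep
    scores = scores - 1e9 * (~mask).astype(scores.dtype)     # bias out masked positions
    e = np.exp(scores - scores.max(axis=-1, keepdims=True))  # numerically stable softmax
    return e / e.sum(axis=-1, keepdims=True)

Tq = Tv = 4
causal_mask = np.tril(np.ones((1, Tq, Tv), dtype=bool))      # position i attends only to j <= i
scores = np.random.randn(2, Tq, Tv)
attn = masked_softmax(scores, causal_mask)                    # rows sum to 1, upper triangle ~ 0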
Example no. 7
    def score(self,
              query,
              key=None,
              scale=1,
              window_width=None,
              q_proj=None,
              target_proj=None):
        r"""
    Arguments:
      query: Query (or target sequence) tensor of shape
        `[batch_size, Tq, dim]` or `[num_heads, batch_size, Tq, dim]` in case
        of multi-heads attention.
      key: Key (or source sequence) tensor of shape
        `[batch_size, Tv, dim]` or `[num_heads, batch_size, Tv, dim]` in case
        of multi-heads attention.
      scale: single `Scalar` or `Tensor` of shape `[dim]` for scaling
        the attention scores, suggested `1/sqrt(dim)` in (Vaswani et al. 2017).
      window_width: `None`, `Integer` or `Float` in `[0, 1]`. The total number
        of frames in a single window for local attention (i.e. `left + 1 + right`).
        Can be given as a fixed number of frames (`int`) or as a fraction of
        the sequence length (`float`). If `None`, `Tq` is used.
      q_proj: `Dense`, an instance of a dense (fully connected) layer
        - for `ScoreLocation`, the number of hidden units is `1`
        - for `ScoreGeneral`, the number of hidden units is `dim`
      target_proj: `Dense`, for predictive local attention; a fully connected
        network applied to the target sequence (i.e. the query) to predict the
        attended position on the source sequence (i.e. the key).
        The layer must have an output dimension of 1 and return logit values.

    Returns:
      Tensor of shape `[num_heads, batch_size, Tq, Tv]`, or
       `[num_heads, batch_size, Tq, 1]` if `ScoreLocation`
    """
        ### Check if multi-head attention is used
        num_heads = _get_num_heads(query)
        if num_heads > 0:
            query = bk.reshape(query, [-1] + [i for i in query.shape[2:]])
            if key is not None:
                key = bk.reshape(key, [-1] + [i for i in key.shape[2:]])
        Tq = query.shape[1]
        Tv = Tq if key is None else key.shape[1]
        # scale shape is `[]` or `[dim]`
        scale = bk.array(scale, dtype=query.dtype)
        ### Check the window width
        if window_width is None:
            window_width = Tq
        elif window_width < 1:
            window_width = window_width * Tv
        window_width = int(window_width)
        ### Locative attention
        if AttentionMechanism.ScoreLocation in self:
            if PosLocalM in self or PosLocalP in self:
                raise NotImplementedError(
                    "ScoreLocation only support Global attention, but given: %s"
                    % str(self))
            # [batch_size * num_heads, Tq, dim]
            scores = bk.reduce_mean(scale) * q_proj(query)
            assert scores.shape[-1] == 1, \
              " q_proj must have only 1 hidden unit, but given %d" % scores.shape[-1]
        ### Other score mode need the key tensor
        else:
            if key is None:
                raise ValueError(
                    "key must be provided for attention type: %s" % str(self))
            ### Attention position (local or global)
            if PosLocalM in self:
                key = key[:, -window_width:]
            elif PosLocalP in self:
                pt = bk.sigmoid(target_proj(bk.reshape(query, ([0], -1))))
                assert pt.shape[-1] == 1, \
                  "target_proj must project the query [., Tq * dim] to [., 1], i.e. " + \
                    "predicting the attention position on source sequence using " + \
                      "knowledge from target sequence."
                pt = Tv * pt  # `[batch_size * num_heads, 1]`
                # `[batch_size * num_heads, Tv]`
                # Eq (10) (Luong et al. 2015)
                gauss_est = bk.exp(
                    -bk.square(bk.arange(Tv, dtype=pt.dtype) - pt) /
                    (2 * bk.square(window_width / 2)))
                # `[batch_size * num_heads, 1, Tv]`
                gauss_est = bk.expand_dims(gauss_est, axis=1)
            ### Additive or concat method
            if AttentionMechanism.ScoreAdditive in self:
                # [batch_size * num_heads, Tq, 1, dim]
                q = bk.expand_dims(query, axis=2)
                # [batch_size * num_heads, 1, Tv, dim]
                k = bk.expand_dims(key, axis=1)
                # [batch_size * num_heads, Tq, Tv]
                scores = bk.reduce_sum(scale * bk.tanh(q + k), axis=-1)
            ### Dot product or multiplicative scoring
            elif AttentionMechanism.ScoreDotProd in self:
                # this is a trick to make attention_scale broadcastable when
                # scale_tied=False
                scores = bk.matmul(scale * query, bk.swapaxes(key, 1, 2))
            ### cosine scoring
            elif AttentionMechanism.ScoreCosine in self:
                # [batch_size * num_heads, Tq, 1, dim]
                q = bk.expand_dims(query, axis=2)
                # [batch_size * num_heads, 1, Tv, dim]
                k = bk.expand_dims(key, axis=1)
                # [batch_size * num_heads, Tq, Tv, dim]
                scores = (q * k) / (bk.norm(q, p=2) * bk.norm(k, p=2))
                scores = bk.reduce_sum(scale * scores, axis=-1, keepdims=False)
            ### general method with only a projection on the query
            elif AttentionMechanism.ScoreGeneral in self:
                query = q_proj(query)
                assert query.shape[-1] == key.shape[-1], \
                  " q_proj must have %d hidden units, but given %d units" % \
                    (key.shape[-1], query.shape[-1])
                scores = bk.matmul(scale * query, bk.swapaxes(key, 1, 2))
            else:
                raise NotImplementedError(
                    "No support for attention_type='%s'" % str(self))
            ### applying the local-predictive attention
            if PosLocalP in self:
                scores = scores * gauss_est
        ### get back the multi-heads shape
        if num_heads > 0:
            scores = bk.reshape(scores,
                                shape=[num_heads, -1] +
                                [i for i in scores.shape[1:]])
        return scores
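For the most common configuration (global attention with `ScoreDotProd` and no extra heads), the computation above reduces to scaled dot-product scores. A hedged NumPy sketch of just that branch:

import numpy as np

def dot_product_scores(query, key, scale=None):
    # query: [batch, Tq, dim]; key: [batch, Tv, dim] -> scores: [batch, Tq, Tv]
    if scale is None:
        scale = 1.0 / np.sqrt(query.shape[-1])   # 1/sqrt(dim), as suggested in (Vaswani et al. 2017)
    return np.matmul(scale * query, np.swapaxes(key, 1, 2))

q = np.random.rand(2, 7, 16)
k = np.random.rand(2, 9, 16)
print(dot_product_scores(q, k).shape)   # (2, 7, 9)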