def call(self, sequence, training=None):
     with bk.framework_(self):
         # [batch_size, time_dim]
         positions = bk.tile(
             bk.expand_dims(bk.arange(sequence.shape[1]), 0),
             [sequence.shape[0], 1])
         dtype = bk.dtype_universal(positions.dtype)
         if dtype not in ('int32', 'int64'):
             positions = bk.cast(positions, dtype='int32')
         pe = bk.embedding(indices=positions, weight=self.position_encoding)
         return pe
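
A minimal NumPy sketch of the same lookup, with an invented `position_encoding` table and made-up shapes, only to show how the tiled position indices pick rows of the table (it does not use the `bk` backend):

import numpy as np

# hypothetical positional-encoding table: [max_len, dim]
max_len, dim = 50, 8
position_encoding = np.random.randn(max_len, dim)

batch_size, time_dim = 4, 10
# [batch_size, time_dim]: every row is 0, 1, ..., time_dim - 1
positions = np.tile(np.arange(time_dim)[None, :], (batch_size, 1))

# the embedding lookup: integer indexing selects one table row per position
pe = position_encoding[positions]   # [batch_size, time_dim, dim]
assert pe.shape == (batch_size, time_dim, dim)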
Example #2
def _apply(self, x):
     # store last input for deconvolution ops
     self._last_input = x
     conved = self.convolve(x)
     output_shape = K.get_shape(conved)
     if not hasattr(self, 'b'):
         pass  # no bias term to add
     elif self.untie_biases:
         conved += K.expand_dims(self.b, 0)
     else:
         conved += K.dimshuffle(self.b, ('x', ) * (self.ndim + 1) + (0, ))
     activated = self.activation(conved)
     # set shape for output
     K.add_shape(activated, output_shape)
     return activated
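
The two bias branches differ only in how `b` is broadcast over the convolution output. A small NumPy sketch of the 2-D, channels-last case (an assumption; shapes are made up) makes the broadcasting explicit; the `dimshuffle` with `('x',) * (ndim + 1) + (0,)` corresponds to reshaping a per-channel bias to `[1, 1, 1, channels]`:

import numpy as np

batch, h, w, channels = 2, 6, 6, 3
conved = np.zeros((batch, h, w, channels))

# untied biases: one bias per output position and channel, shared across the batch
b_untied = np.random.randn(h, w, channels)
out_untied = conved + b_untied[None, ...]         # expand_dims(b, 0)

# tied biases: one bias per channel, shared across batch and spatial positions
b_tied = np.random.randn(channels)
out_tied = conved + b_tied.reshape(1, 1, 1, -1)   # dimshuffle('x', 'x', 'x', 0)

assert out_untied.shape == out_tied.shape == conved.shape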
Example #3
def _apply(self, X, h0=None, c0=None, mask=None, **kwargs):
     # check input_shape
     input_shape = K.get_shape(X)
     # ====== check mask ====== #
     if mask is not None and (K.ndim(mask) != 2
                              or K.get_shape(mask)[-1] != input_shape[1]):
         raise Exception('Mask must be a 2-D matrix and its time dimension '
                         '(i.e. the second dimension) must be equal to "%d", '
                         'but the given mask has shape "%s".' %
                         (input_shape[1], K.get_shape(mask)))
     # add broadcastable dimension for mask
     if mask is not None:
         mask = K.expand_dims(mask, dim=-1)
     # ====== initialize states ====== #
     # hidden states
     h0 = _check_rnn_hidden_states(h0, self, input_shape, 'h0')
     c0 = _check_rnn_hidden_states(c0, self, input_shape, 'c0')
     # turn off repeat_states if batch_size already included
     if K.get_shape(h0)[0] != 1 and K.get_shape(c0)[0] != 1:
         self.repeat_states = False
     # ====== precompute input ====== #
     # linear or norm input mode
     if self.input_mode != 'skip':
         X = K.dot(X, self.W_in)
         if self.input_mode == 'norm':
             # normalize over all axes except the feature dimension
             bn = BatchNorm(axes=(0, 1),
                            activation=K.linear,
                            gamma_init=self.gamma,
                            beta_init=self.beta,
                            mean_init=self.mean,
                            inv_std_init=self.inv_std)
             X = bn(X)
     # skip input
     elif input_shape[-1] == self.num_units:
         X = K.repeat(X, 4, axes=-1)
     # ====== compute recurrent output ====== #
     out = self._rnn(X,
                     h0=h0,
                     c0=c0,
                     mask=mask,
                     **self.get_recurrent_info(kwargs))
     if not self.return_cell_memory:
         out = out[:-1]
     for i in out:
         K.add_shape(i, shape=input_shape[:-1] + (self.num_units, ))
     # if only one output remains (the hidden states), return it directly
     return out[0] if len(out) == 1 else out
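
In the non-skip modes the input is projected once by `W_in` before the recurrent loop; the `K.repeat(X, 4, axes=-1)` branch suggests the usual convention that `W_in` stacks the weights of the four LSTM gates. A rough NumPy sketch of that precomputation, with invented shapes and without the batch-norm branch:

import numpy as np

batch, time, in_dim, num_units = 2, 5, 16, 32

X = np.random.randn(batch, time, in_dim)
# W_in stacks the input weights of the four gates: [in_dim, 4 * num_units]
W_in = np.random.randn(in_dim, 4 * num_units)

# precomputed projection, done once outside the recurrent loop
X_proj = X @ W_in                  # [batch, time, 4 * num_units]

# 'skip' mode: no projection, the input itself is repeated for the four gates
X_skip = np.tile(np.random.randn(batch, time, num_units), (1, 1, 4))

assert X_proj.shape == X_skip.shape == (batch, time, 4 * num_units)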
Example #4
    def call(self, inputs, training=None):
        # if the smallest delay is negative, the delays have already been
        # shifted so that they start from 0 (i.e. relative positions)
        shape = inputs.shape
        timestep = shape[1]
        y = []

        for delay, layer in zip(self.delays, self.all_layers):
            start = delay
            end = timestep - self.context_length + delay + 1 - self.min_delay
            y.append(expand_dims(layer(inputs[:, start:end]), axis=0))

        y = concatenate(y, axis=0)
        y = self.fn_pooling(y, axis=0)

        if (isinstance(self.pooling, string_types) and
                'none' in self.pooling.lower() and
                self.context_length == 1):
            y = squeeze(y, axis=0)
        return y
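
Each layer in the loop sees a time-shifted slice of the input. A toy sketch of the slicing, assuming (as the comment about relative positions suggests) that the delays have already been shifted to be non-negative; the values of `delays` and `context_length` here are invented:

import numpy as np

timestep, dim = 10, 4
inputs = np.arange(timestep).reshape(1, timestep, 1).repeat(dim, axis=-1)

delays = (0, 2, 4)                                # hypothetical context offsets
min_delay = min(delays)                           # 0
context_length = max(delays) - min(delays) + 1    # 5

for delay in delays:
    start = delay
    end = timestep - context_length + delay + 1 - min_delay
    window = inputs[:, start:end]
    # every delay yields a slice of the same length, so the copies can be stacked
    print(delay, window.shape, window[0, :, 0])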
Example #5
def _apply(self, x):
     if K.ndim(x) != self.conv.ndim + 2:
         raise ValueError(
             'Input has %d dimensions, but this Op requires a %d-D '
             'tensor.' % (K.ndim(x), self.conv.ndim + 2))
     # ====== prepare the deconvolution ====== #
     stride = self.conv.strides
     border_mode = self.conv.pad
     W = self.conv.W
     dilation = self.conv.dilation
     # note: for dilated convolution the weights must be transposed;
     # select the transposed-convolution function matching the input rank
     if self.conv.ndim == 2:
         deconv_func = K.deconv2d
     elif self.conv.ndim == 3:
         deconv_func = K.deconv3d
     else:
         raise Exception('No support for %d-D input in TransposedConv' %
                         self.conv.ndim)
     # Theano requires batch_dims to be a Constant or None, but TensorFlow
     # requires batch_dims to be a native TensorVariable
     conved = deconv_func(
         x,
         kernel=W,
         output_shape=K.get_shape(
             self.conv._last_input,
             native=(K.backend() == 'tensorflow')),
         strides=stride,
         border_mode=border_mode,
         filter_dilation=dilation)
     if hasattr(self, 'b'):
         if self.conv.untie_biases:
             conved += K.expand_dims(self.b, 0)
         else:
             conved += K.dimshuffle(self.b,
                                    ('x', ) * (self.conv.ndim + 1) + (0, ))
     activated = self.conv.activation(conved)
     K.add_shape(activated, self.conv.input_shape)
     return activated
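
The essential trick is that the transposed convolution reuses the forward convolution's kernel and takes the forward pass's input shape (stored in `_last_input`) as its `output_shape`. A standalone sketch of the same idea in plain TensorFlow (not odin's `K.deconv2d`), with shapes invented for the example:

import tensorflow as tf

batch, h, w, c_in, c_out = 2, 8, 8, 3, 16
x = tf.random.normal((batch, h, w, c_in))
# forward-convolution kernel: [kh, kw, in_channels, out_channels]
kernel = tf.random.normal((3, 3, c_in, c_out))

# forward convolution, stride 2, 'SAME' padding: [2, 8, 8, 3] -> [2, 4, 4, 16]
y = tf.nn.conv2d(x, kernel, strides=2, padding='SAME')

# the transposed convolution reuses the same kernel and the forward input's
# shape as output_shape, mirroring self.conv._last_input above
x_rec = tf.nn.conv2d_transpose(y, kernel, output_shape=tf.shape(x),
                               strides=2, padding='SAME')
print(x_rec.shape)   # (2, 8, 8, 3)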
Example #6
    def normalize(self, scores):
        r"""Normalize the attention scores using the Frobenius ("fro") norm penalty
    that encourages diversity among attention heads,
    :math:`P = ||A^T A - I||_2^2` (Kim et al. 2017)

    Arguments:
      scores: Tensor with shape `[batch_size * num_heads, Tq, Tv]`
    """
        # it is easier to assume there is always at least one head
        num_heads = _get_num_heads(scores)
        if num_heads == 0:
            return bk.cast(0., scores.dtype)
        # [batch_size, num_heads, Tq * Tv]
        scoresT = bk.swapaxes(bk.reshape(scores, shape=([0], [1], -1)), 0, 1)
        # [batch_size, Tq * Tv, num_heads]
        scores = bk.swapaxes(scoresT, 1, 2)
        # [batch_size, num_heads, num_heads]
        A = bk.matmul(scoresT, scores)
        # [batch_size, num_heads, num_heads]
        I = bk.eye(num_heads, dtype=A.dtype)
        I = bk.expand_dims(I, axis=0)
        I = bk.tile(I, reps=A.shape[0], axis=0)
        # normalized
        P = bk.norm(A - I, p="fro")**2
        return P
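
A compact NumPy version of the same penalty, with a made-up `[batch_size, num_heads, Tq, Tv]` scores tensor, to show the flattening and the squared Frobenius norm (it mirrors the computation above, not the `bk` API):

import numpy as np

batch_size, num_heads, Tq, Tv = 2, 4, 5, 7
scores = np.random.rand(batch_size, num_heads, Tq, Tv)

# flatten each head's attention map: [batch_size, num_heads, Tq * Tv]
S = scores.reshape(batch_size, num_heads, -1)
# head-by-head similarity: [batch_size, num_heads, num_heads]
A = S @ S.transpose(0, 2, 1)
I = np.eye(num_heads)[None, ...]

# squared Frobenius norm of (A - I): large when heads attend to the same positions
P = np.sum((A - I) ** 2)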
Example #7
    def align(self,
              scores,
              value,
              query=None,
              q_mask=None,
              v_mask=None,
              causal=False,
              residual=False,
              dropout=0,
              temporal_dropout=False,
              sample_shape=1,
              temperature=0.5,
              training=None):
        r"""Applies attention scores to the given value tensor.

    Arguments:
      scores: Attention Scores float tensor of shape
        `[num_heads, batch_size, Tq, Tv]`.
      value: Value (or source sequence) tensor of shape
        `[num_heads, batch_size, Tv, dim]`.
      query: Query (or target sequence) tensor of shape
        `[num_heads, batch_size, Tq, dim]`.
      q_mask: A boolean query mask `Tensor` of shape `[batch_size, Tq]`.
        If given, the output will be zero at the positions where
        `mask==False`.
      v_mask: A boolean value mask `Tensor` of shape `[batch_size, Tv]`.
        If given, will apply the mask such that values at positions where
        `mask==False` do not contribute to the result.
      dropout : Float. Dropout probability of the attention scores.
      temporal_dropout : Boolean. If `True`, use the same dropout mask along the
        temporal axis (i.e. the first dimension).
      sample_shape (`Integer`) : number of MCMC samples used to estimate the
        gradient of hard attention.
      temperature: A 0-D `Tensor`, representing the temperature
        of a set of RelaxedOneHotCategorical distributions. The temperature
        should be positive.

    Returns:
      attended sequence: Tensor of shape
        * `[sample_shape, num_heads, batch_size, Tq, dim]` for (hard + multi-heads)
        * `[sample_shape, batch_size, Tq, dim]` for (hard + no-head)
        * `[num_heads, batch_size, Tq, dim]` for (soft + multi-heads)
        * `[batch_size, Tq, dim]` for (soft + no-head)
      attention distribution : for soft attention, a Tensor of shape
        * `[num_heads, batch_size, Tq]` for self-attention
        * `[num_heads, batch_size, Tq, Tv]` for inter-attention.
        For hard attention, a one-hot categorical distribution of shape
        * `[sample_shape, num_heads, batch_size, Tq]` for self-attention
        * `[sample_shape, num_heads, batch_size, Tq, Tv]` for inter-attention.
        If multi-head attention is not used, the `[num_heads]` dimension is omitted.
    """
        num_heads = _get_num_heads(scores)
        if num_heads == 0:
            Tq = scores.shape[1]
            Tv = scores.shape[2]
        else:
            Tq = scores.shape[2]
            Tv = scores.shape[3]
        if value is None:
            if query is None:
                raise ValueError("both query and value are None, "
                                 "at least one of them must be given")
            value = query
        # ====== Causal mask ====== #
        if causal:
            # Creates a lower triangular mask, so position i cannot attend to
            # positions j>i. This prevents the flow of information from the future
            # into the past.
            scores_shape = scores.shape
            # causal_mask_shape = [1, Tq, Tv].
            causal_mask_shape = bk.concatenate(
                [bk.ones_like(scores_shape[:-2]), scores_shape[-2:]], axis=0)
            causal_mask = bk.tril_mask(causal_mask_shape)
        else:
            causal_mask = None
        if v_mask is not None:
            # LocalM applied
            if PosLocalM in self:
                v_mask = v_mask[:, -Tv:]
            # Mask of shape [batch_size, 1, Tv].
            v_mask = bk.expand_dims(v_mask, axis=-2)
            v_mask = bk.cast(v_mask, 'bool')
            if num_heads > 0:
                v_mask = bk.expand_dims(v_mask, axis=0)
        scores_mask = bk.logical_and(v_mask, causal_mask)
        ### applying the scores mask
        if scores_mask is not None:
            padding_mask = bk.logical_not(scores_mask)
            # Bias so padding positions do not contribute to attention distribution.
            scores -= 1.e9 * bk.cast(padding_mask, dtype=scores.dtype)
        # ====== convert attention score to distribution ====== #
        # if the last dimension is 1, no point for applying softmax, hence,
        # softmax to the second last dimension
        ### soft attention
        if AlignSoft in self:
            attention_distribution = bk.softmax(
                scores, axis=-2 if scores.shape[-1] == 1 else -1)
        ### relaxed hard attention
        elif AlignRelax in self:
            attention_distribution = bay.distributions.RelaxedOneHotCategorical(
                temperature=temperature,
                logits=bk.squeeze(scores, axis=-1)
                if scores.shape[-1] == 1 else scores)
            fsample = partial(bay.Distribution.sample,
                              sample_shape=sample_shape)
            attention_distribution = bay.coercible_tensor(
                attention_distribution, convert_to_tensor_fn=fsample)
        ### hard attention
        elif AlignHard in self:
            attention_distribution = bay.distributions.OneHotCategorical(
                logits=bk.squeeze(scores, axis=-1)
                if scores.shape[-1] == 1 else scores,
                dtype=value.dtype)
            fsample = partial(bay.Distribution.sample,
                              sample_shape=sample_shape)
            attention_distribution = bay.coercible_tensor(
                attention_distribution, convert_to_tensor_fn=fsample)
        # ======  dropout the attention scores ====== #
        attention = bk.dropout(attention_distribution,
                               p_drop=dropout,
                               axis=1 if temporal_dropout else None,
                               training=training and dropout > 0)
        # ====== applying the attention ====== #
        if self.is_self_attention and ScoreLocation in self:
            if attention.shape[-1] != 1:
                result = bk.expand_dims(bk.array(attention), axis=-1) * value
            else:
                result = attention * value
        else:
            if PosLocalM in self:
                value = value[:, -Tv:] if num_heads == 0 else value[:, :, -Tv:]
            result = bk.matmul(attention, value)
        # ====== applying the Query mask ====== #
        if q_mask is not None:
            assert q_mask.shape[1] == Tq,\
              "Query mask has time dimension %d, but query has time dimension %d" \
                % (q_mask.shape[1], Tq)
            # Mask of shape [batch_size, Tq, 1].
            q_mask = bk.expand_dims(q_mask, axis=-1)
            result *= bk.cast(q_mask, dtype=result.dtype)
        # ====== residual connection ====== #
        if residual:
            if query is None:
                raise ValueError("query must be given for residual connection")
            result += query
        # ====== return ====== #
        return result, attention_distribution
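
The main steps of the soft-attention path (causal mask, padding bias, softmax, weighted sum over `value`) can be put together in plain NumPy; the sketch below uses invented shapes and leaves out the multi-head, hard-attention, and dropout branches:

import numpy as np

batch, Tq, Tv, dim = 2, 4, 4, 8
scores = np.random.randn(batch, Tq, Tv)
value = np.random.randn(batch, Tv, dim)
v_mask = np.array([[1, 1, 1, 0],
                   [1, 1, 0, 0]], dtype=bool)   # padded value steps are False

# causal mask: position i may only attend to positions j <= i
causal_mask = np.tril(np.ones((Tq, Tv), dtype=bool))[None, :, :]
scores_mask = causal_mask & v_mask[:, None, :]       # [batch, Tq, Tv]

# large negative bias so masked positions get ~0 probability after softmax
scores = scores - 1e9 * (~scores_mask)

# softmax over the value axis
e = np.exp(scores - scores.max(axis=-1, keepdims=True))
attention = e / e.sum(axis=-1, keepdims=True)

# attended sequence: [batch, Tq, dim]
result = attention @ value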
Example #8
    def score(self,
              query,
              key=None,
              scale=1,
              window_width=None,
              q_proj=None,
              target_proj=None):
        r"""
    Arguments:
      query: Query (or target sequence) tensor of shape
        `[batch_size, Tq, dim]` or `[num_heads, batch_size, Tq, dim]` in case
        of multi-heads attention.
      key: Key (or source sequence) tensor of shape
        `[batch_size, Tv, dim]` or `[num_heads, batch_size, Tv, dim]` in case
        of multi-heads attention.
      scale: single `Scalar` or `Tensor` of shape `[dim]` for scaling
        the attention scores, suggested `1/sqrt(dim)` in (Vaswani et al. 2017).
      window_width : `None`, `Integer` or `Float` ([0, 1]). The total number of
        frames for a single window in local attention (i.e. `left + 1 + right`).
        Can be given as a fixed number of frames (`int`) or as a percentage of
        the sequence length (`float`). If `None`, use `Tq`.
      q_proj : `Dense`, instance of a dense or fully connected layer
        - for `ScoreLocation`, the number of hidden units is `1`
        - for `ScoreGeneral`, the number of hidden units is `dim`
      target_proj : `Dense`, for predictive local attention, applies
        a fully connected network to the target sequence (i.e. the query) to
        predict the position on the source sequence (i.e. the key).
        The layer must have an output dimension of 1 and return logit values.

    Returns:
      Tensor of shape `[num_heads, batch_size, Tq, Tv]`, or
       `[num_heads, batch_size, Tq, 1]` if `ScoreLocation`
    """
        ### Check if multi-head attention is used
        num_heads = _get_num_heads(query)
        if num_heads > 0:
            query = bk.reshape(query, [-1] + [i for i in query.shape[2:]])
            if key is not None:
                key = bk.reshape(key, [-1] + [i for i in key.shape[2:]])
        Tq = query.shape[1]
        Tv = Tq if key is None else key.shape[1]
        # scale shape is `[]` or `[dim]`
        scale = bk.array(scale, dtype=query.dtype)
        ### Check the window width
        if window_width is None:
            window_width = Tq
        elif window_width < 1:
            window_width = window_width * Tv
        window_width = int(window_width)
        ### Locative attention
        if AttentionMechanism.ScoreLocation in self:
            if PosLocalM in self or PosLocalP in self:
                raise NotImplementedError(
                    "ScoreLocation only support Global attention, but given: %s"
                    % str(self))
            # [batch_size * num_heads, Tq, 1]
            scores = bk.reduce_mean(scale) * q_proj(query)
            assert scores.shape[-1] == 1, \
              " q_proj must have only 1 hidden unit, but given %d" % scores.shape[-1]
        ### Other score mode need the key tensor
        else:
            if key is None:
                raise ValueError(
                    "key must be provided for attention type: %s" % str(self))
            ### Attention position (local or global)
            if PosLocalM in self:
                key = key[:, -window_width:]
            elif PosLocalP in self:
                pt = bk.sigmoid(target_proj(bk.reshape(query, ([0], -1))))
                assert pt.shape[-1] == 1, \
                  "target_proj must project the query [., Tq * dim] to [., 1], i.e. " + \
                    "predicting the attention position on source sequence using " + \
                      "knowledge from target sequence."
                pt = Tv * pt  # `[batch_size * num_heads, 1]`
                # `[batch_size * num_heads, Tv]`
                # Eq (10) (Luong et al. 2015)
                gauss_est = bk.exp(
                    -bk.square(bk.arange(Tv, dtype=pt.dtype) - pt) /
                    (2 * bk.square(window_width / 2)))
                # `[batch_size * num_heads, 1, Tv]`
                gauss_est = bk.expand_dims(gauss_est, axis=1)
            ### Additive or concat method
            if AttentionMechanism.ScoreAdditive in self:
                # [batch_size * num_heads, Tq, 1, dim]
                q = bk.expand_dims(query, axis=2)
                # [batch_size * num_heads, 1, Tv, dim]
                k = bk.expand_dims(key, axis=1)
                # [batch_size * num_heads, Tq, Tv]
                scores = bk.reduce_sum(scale * bk.tanh(q + k), axis=-1)
            ### Dot product or multiplicative scoring
            elif AttentionMechanism.ScoreDotProd in self:
                # this is a trick to make attention_scale broadcastable when
                # scale_tied=False
                scores = bk.matmul(scale * query, bk.swapaxes(key, 1, 2))
            ### cosine scoring
            elif AttentionMechanism.ScoreCosine in self:
                # [batch_size * num_heads, Tq, 1, dim]
                q = bk.expand_dims(query, axis=2)
                # [batch_size * num_heads, 1, Tv, dim]
                k = bk.expand_dims(key, axis=1)
                # [batch_size * num_heads, Tq, Tv, dim]
                scores = (q * k) / (bk.norm(q, p=2) * bk.norm(k, p=2))
                scores = bk.reduce_sum(scale * scores, axis=-1, keepdims=False)
            ### general method with only project on the query
            elif AttentionMechanism.ScoreGeneral in self:
                query = q_proj(query)
                assert query.shape[-1] == key.shape[-1], \
                  " q_proj must have %d hidden units, but given %d units" % \
                    (key.shape[-1], query.shape[-1])
                scores = bk.matmul(scale * query, bk.swapaxes(key, 1, 2))
            else:
                raise NotImplementedError(
                    "No support for attention_type='%s'" % str(self))
            ### applying the local-predictive attention
            if PosLocalP in self:
                scores = scores * gauss_est
        ### get back the multi-heads shape
        if num_heads > 0:
            scores = bk.reshape(scores,
                                shape=[num_heads, -1] +
                                [i for i in scores.shape[1:]])
        return scores
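
For reference, a plain NumPy sketch of the two most common branches above: the scaled dot-product score and the Gaussian window of predictive local attention (Eq. 10 in Luong et al. 2015). Shapes are invented, and `pt` is faked with random numbers instead of a `sigmoid(target_proj(query))` prediction:

import numpy as np

batch, Tq, Tv, dim = 2, 5, 7, 16
query = np.random.randn(batch, Tq, dim)
key = np.random.randn(batch, Tv, dim)

# scaled dot-product scores: [batch, Tq, Tv]
scale = 1.0 / np.sqrt(dim)
scores = scale * query @ key.transpose(0, 2, 1)

# predictive local attention: a Gaussian window centred at a predicted position pt
window_width = 4
pt = Tv * np.random.rand(batch, 1)        # stand-in for sigmoid(target_proj(query)) * Tv
positions = np.arange(Tv)                 # [Tv]
gauss = np.exp(-np.square(positions - pt) / (2 * np.square(window_width / 2)))
scores = scores * gauss[:, None, :]       # broadcast the window over Tq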