def call(self, sequence, training=None):
  with bk.framework_(self):
    # [batch_size, time_dim]
    positions = bk.tile(
        bk.expand_dims(bk.arange(sequence.shape[1]), 0),
        [sequence.shape[0], 1])
    dtype = bk.dtype_universal(positions.dtype)
    if dtype not in ('int32', 'int64'):
      positions = bk.cast(positions, dtype='int32')
    pe = bk.embedding(indices=positions, weight=self.position_encoding)
    return pe

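
# Illustrative sketch (assumption, plain NumPy, not the `bk` backend): how the
# position indices above are built. A single `arange` over the time axis is
# tiled across the batch, giving one row of positions per example.
import numpy as np

def make_position_indices(batch_size, time_dim):
  # [time_dim] -> [1, time_dim] -> [batch_size, time_dim]
  positions = np.tile(np.arange(time_dim)[np.newaxis, :], (batch_size, 1))
  return positions.astype('int32')

# e.g. make_position_indices(2, 4) ->
# [[0, 1, 2, 3],
#  [0, 1, 2, 3]]
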
def _apply(self, x):
  # store last input for deconvolution ops
  self._last_input = x
  conved = self.convolve(x)
  output_shape = K.get_shape(conved)
  # add the bias (if any): either one bias per output position (untied)
  # or one bias per filter broadcast over all spatial dimensions (tied)
  if hasattr(self, 'b'):
    if self.untie_biases:
      conved += K.expand_dims(self.b, 0)
    else:
      conved += K.dimshuffle(self.b, ('x',) * (self.ndim + 1) + (0,))
  activated = self.activation(conved)
  # set shape for output
  K.add_shape(activated, output_shape)
  return activated

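
# Illustrative sketch (assumption, plain NumPy stand-in for the `K` backend):
# the two bias modes above on a channel-last 2-D feature map. A tied bias has
# one value per filter and is broadcast over batch and space; an untied bias
# has one value per output position and is broadcast over the batch only.
import numpy as np

conved = np.zeros((8, 16, 16, 32))       # [batch, height, width, num_filters]
b_tied = np.random.randn(32)             # one bias per filter
b_untied = np.random.randn(16, 16, 32)   # one bias per output position

tied = conved + b_tied.reshape((1, 1, 1, 32))
untied = conved + b_untied[np.newaxis]
assert tied.shape == untied.shape == conved.shape
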
def _apply(self, X, h0=None, c0=None, mask=None, **kwargs):
  # check input_shape
  input_shape = K.get_shape(X)
  # ====== check mask ====== #
  if mask is not None and (K.ndim(mask) != 2 or
                           K.get_shape(mask)[-1] != input_shape[1]):
    raise Exception('Mask must be a 2-D matrix and the time dimension '
                    '(i.e. the second dimension) must equal "%d", '
                    'but the given mask has shape "%s".' %
                    (input_shape[1], K.get_shape(mask)))
  # add broadcastable dimension for mask
  if mask is not None:
    mask = K.expand_dims(mask, dim=-1)
  # ====== initialize states ====== #
  # hidden states
  h0 = _check_rnn_hidden_states(h0, self, input_shape, 'h0')
  c0 = _check_rnn_hidden_states(c0, self, input_shape, 'c0')
  # turn off repeat_states if batch_size already included
  if K.get_shape(h0)[0] != 1 and K.get_shape(c0)[0] != 1:
    self.repeat_states = False
  # ====== precompute input ====== #
  # linear or norm input mode
  if self.input_mode != 'skip':
    X = K.dot(X, self.W_in)
    if self.input_mode == 'norm':
      # normalize all axes except the time dimension
      bn = BatchNorm(axes=(0, 1), activation=K.linear,
                     gamma_init=self.gamma, beta_init=self.beta,
                     mean_init=self.mean, inv_std_init=self.inv_std)
      X = bn(X)
  # skip input
  elif input_shape[-1] == self.num_units:
    X = K.repeat(X, 4, axes=-1)
  # ====== compute recurrent output ====== #
  out = self._rnn(X, h0=h0, c0=c0, mask=mask,
                  **self.get_recurrent_info(kwargs))
  if not self.return_cell_memory:
    out = out[:-1]
  for i in out:
    K.add_shape(i, shape=input_shape[:-1] + (self.num_units,))
  # only care about the first state
  return out[0] if len(out) == 1 else out

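
# Illustrative sketch (assumption, plain NumPy): the input precomputation
# above maps the input to 4 * num_units features, one block per LSTM gate.
# `np.tile` stands in for `K.repeat` in 'skip' mode; names and sizes are
# for demonstration only.
import numpy as np

batch, time, in_dim, num_units = 2, 5, 8, 8
X = np.random.randn(batch, time, in_dim)

# 'linear' input mode: a single projection producing features for all 4 gates
W_in = np.random.randn(in_dim, 4 * num_units)
X_linear = X @ W_in                      # [batch, time, 4 * num_units]

# 'skip' input mode (requires in_dim == num_units): tile the input 4 times
X_skip = np.tile(X, (1, 1, 4))           # [batch, time, 4 * num_units]
assert X_linear.shape == X_skip.shape
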
def call(self, inputs, training=None):
  # if the smallest delay is negative, the slices are shifted so that
  # indexing starts from 0 (i.e. relative positions)
  shape = inputs.shape
  timestep = shape[1]
  y = []
  for delay, layer in zip(self.delays, self.all_layers):
    start = delay
    end = timestep - self.context_length + delay + 1 - self.min_delay
    y.append(expand_dims(layer(inputs[:, start:end]), axis=0))
  y = concatenate(y, axis=0)
  y = self.fn_pooling(y, axis=0)
  if isinstance(self.pooling, string_types) and \
      'none' in self.pooling.lower() and \
      self.context_length == 1:
    y = squeeze(y, axis=0)
  return y

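
# Illustrative sketch (assumption, plain NumPy): how the delayed slices above
# line up for a time-delayed layer with context offsets [0, 1, 2]. Each delay
# selects a shifted window of the time axis; all windows share the same length
# `timestep - context_length + 1`, so they can be stacked and pooled.
import numpy as np

timestep = 6
frames = np.arange(timestep)             # stand-in for the time axis
delays = [0, 1, 2]
context_length, min_delay = len(delays), min(delays)

windows = []
for delay in delays:
  start = delay
  end = timestep - context_length + delay + 1 - min_delay
  windows.append(frames[start:end])

# windows -> [array([0, 1, 2, 3]), array([1, 2, 3, 4]), array([2, 3, 4, 5])]
pooled = np.stack(windows, axis=0).mean(axis=0)   # a 'mean' pooling stand-in
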
def _apply(self, x):
  if K.ndim(x) != self.conv.ndim + 2:
    raise ValueError('Input has %d dimensions, but this Op requires a %d-D '
                     'tensor.' % (K.ndim(x), self.conv.ndim + 2))
  # ====== prepare the deconvolution ====== #
  stride = self.conv.strides
  border_mode = self.conv.pad
  W = self.conv.W
  dilation = self.conv.dilation
  # if dilated convolution, must transpose the weights
  if self.conv.ndim == 2:
    deconv_func = K.deconv2d
  elif self.conv.ndim == 3:
    deconv_func = K.deconv3d
  else:
    raise Exception('No support for %d-D input in TransposedConv' %
                    self.conv.ndim)
  # theano requires batch_dims to be a constant or None, but tensorflow
  # requires batch_dims to be a native TensorVariable
  conved = deconv_func(
      x,
      kernel=W,
      output_shape=K.get_shape(
          self.conv._last_input,
          native=True if K.backend() == 'tensorflow' else False),
      strides=stride,
      border_mode=border_mode,
      filter_dilation=dilation)
  if hasattr(self, 'b'):
    if self.conv.untie_biases:
      conved += K.expand_dims(self.b, 0)
    else:
      conved += K.dimshuffle(self.b, ('x',) * (self.conv.ndim + 1) + (0,))
  activated = self.conv.activation(conved)
  K.add_shape(activated, self.conv.input_shape)
  return activated

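
# Illustrative sketch (assumption): why the transposed convolution above takes
# the stored input shape as `output_shape`. With the standard 'valid' shape
# formulas below, strided convolution can lose frames to rounding, so the
# transposed direction alone cannot always recover the exact input length.
def conv_output_length(input_length, kernel, stride):
  # forward 'valid' convolution
  return (input_length - kernel) // stride + 1

def deconv_output_length(output_length, kernel, stride):
  # transposed 'valid' convolution (exact inverse only when the forward
  # pass had no rounding)
  return (output_length - 1) * stride + kernel

assert deconv_output_length(conv_output_length(32, kernel=3, stride=2),
                            kernel=3, stride=2) == 31  # one frame lost to rounding
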
def normalize(self, scores):
  r""" Normalize attention scores using the "fro"-norm, encouraging
  diversity among attention heads,
  math::`P = \|A^T A - I\|_F^2` (Kim et al. 2017)

  Arguments:
    scores: Tensor with shape `[batch_size * num_heads, Tq, Tv]`
  """
  # it is easier to assume there is always at least 1 head
  num_heads = _get_num_heads(scores)
  if num_heads == 0:
    return bk.cast(0., scores.dtype)
  # [batch_size, num_heads, Tq * Tv]
  scoresT = bk.swapaxes(bk.reshape(scores, shape=([0], [1], -1)), 0, 1)
  # [batch_size, Tq * Tv, num_heads]
  scores = bk.swapaxes(scoresT, 1, 2)
  # [batch_size, num_heads, num_heads]
  A = bk.matmul(scoresT, scores)
  # [batch_size, num_heads, num_heads]
  I = bk.eye(num_heads, dtype=A.dtype)
  I = bk.expand_dims(I, axis=0)
  I = bk.tile(I, reps=A.shape[0], axis=0)
  # squared Frobenius norm of the difference
  P = bk.norm(A - I, p="fro")**2
  return P

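
# Illustrative sketch (assumption, plain NumPy): the head-diversity penalty
# computed above. Each head's attention map is flattened to a vector, the
# heads' pairwise inner products form a Gram matrix, and the penalty is the
# squared Frobenius norm of (Gram - I): it is small only when the heads
# attend to different positions (near-orthonormal flattened maps).
import numpy as np

batch_size, num_heads, Tq, Tv = 2, 4, 3, 5
scores = np.random.rand(num_heads, batch_size, Tq, Tv)

# flatten each head's attention map: [batch_size, num_heads, Tq * Tv]
flat = np.swapaxes(scores.reshape(num_heads, batch_size, -1), 0, 1)
# pairwise similarity between heads: [batch_size, num_heads, num_heads]
gram = flat @ np.swapaxes(flat, 1, 2)
# squared Frobenius norm of (gram - I), summed over the batch
P = np.sum((gram - np.eye(num_heads)) ** 2)
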
def align(self,
          scores,
          value,
          query=None,
          q_mask=None,
          v_mask=None,
          causal=False,
          residual=False,
          dropout=0,
          temporal_dropout=False,
          sample_shape=1,
          temperature=0.5,
          training=None):
  r"""Applies attention scores to the given value tensor.

  Arguments:
    scores: Attention scores float tensor of shape
      `[num_heads, batch_size, Tq, Tv]`.
    value: Value (or source sequence) tensor of shape
      `[num_heads, batch_size, Tv, dim]`.
    query: Query (or target sequence) tensor of shape
      `[num_heads, batch_size, Tq, dim]`.
    q_mask: A boolean query mask `Tensor` of shape `[batch_size, Tq]`.
      If given, the output will be zero at the positions where
      `mask==False`.
    v_mask: A boolean value mask `Tensor` of shape `[batch_size, Tv]`.
      If given, will apply the mask such that values at positions where
      `mask==False` do not contribute to the result.
    causal: Boolean. If `True`, apply a lower triangular mask so that
      position `i` cannot attend to positions `j > i`.
    residual: Boolean. If `True`, add the query to the attended output
      (residual connection).
    dropout: Float. Dropout probability of the attention scores.
    temporal_dropout: Boolean. If `True`, use the same dropout mask along
      the temporal axis (i.e. the 1-st dimension).
    sample_shape: `Integer`, the number of MCMC samples for estimating the
      gradient of hard attention.
    temperature: A 0-D `Tensor`, representing the temperature of a set of
      RelaxedOneHotCategorical distributions. The temperature should be
      positive.
    training: Boolean. Whether the layer is run in training mode
      (enables dropout).

  Returns:
    attended sequence: Tensor of shape
      * `[sample_shape, num_heads, batch_size, Tq, dim]` for
        (hard + multi-heads)
      * `[sample_shape, batch_size, Tq, dim]` for (hard + no-head)
      * `[num_heads, batch_size, Tq, dim]` for (soft + multi-heads)
      * `[batch_size, Tq, dim]` for (soft + no-head)
    attention distribution:
      for soft attention, return a Tensor of shape
      * `[num_heads, batch_size, Tq]` for self-attention
      * `[num_heads, batch_size, Tq, Tv]` for inter-attention
      for hard attention, return a one-hot categorical distribution of shape
      * `[sample_shape, num_heads, batch_size, Tq]` for self-attention
      * `[sample_shape, num_heads, batch_size, Tq, Tv]` for inter-attention
      if multi-heads attention wasn't used, omit the `[num_heads]` dimension.
  """
  num_heads = _get_num_heads(scores)
  if num_heads == 0:
    Tq = scores.shape[1]
    Tv = scores.shape[2]
  else:
    Tq = scores.shape[2]
    Tv = scores.shape[3]
  if value is None:
    if query is None:
      raise ValueError("both query and value are None, "
                       "at least one of them must be given")
    value = query
  # ====== Causal mask ====== #
  if causal:
    # Creates a lower triangular mask, so position i cannot attend to
    # positions j > i. This prevents the flow of information from the
    # future into the past.
    scores_shape = scores.shape
    # causal_mask_shape = [1, Tq, Tv].
    causal_mask_shape = bk.concatenate(
        [bk.ones_like(scores_shape[:-2]), scores_shape[-2:]], axis=0)
    causal_mask = bk.tril_mask(causal_mask_shape)
  else:
    causal_mask = None
  if v_mask is not None:
    # LocalM applied: only the last Tv positions are attended
    if PosLocalM in self:
      v_mask = v_mask[:, -Tv:]
    # Mask of shape [batch_size, 1, Tv].
    v_mask = bk.expand_dims(v_mask, axis=-2)
    v_mask = bk.cast(v_mask, 'bool')
    if num_heads > 0:
      v_mask = bk.expand_dims(v_mask, axis=0)
  scores_mask = bk.logical_and(v_mask, causal_mask)
  ### applying the scores mask
  if scores_mask is not None:
    padding_mask = bk.logical_not(scores_mask)
    # Bias so padding positions do not contribute to the attention
    # distribution.
    scores -= 1.e9 * bk.cast(padding_mask, dtype=scores.dtype)
  # ====== convert attention score to distribution ====== #
  # if the last dimension is 1, there is no point in applying softmax,
  # hence, softmax along the second-to-last dimension
  ### soft attention
  if AlignSoft in self:
    attention_distribution = bk.softmax(
        scores, axis=-2 if scores.shape[-1] == 1 else -1)
  ### relaxed hard attention
  elif AlignRelax in self:
    attention_distribution = bay.distributions.RelaxedOneHotCategorical(
        temperature=temperature,
        logits=bk.squeeze(scores, axis=-1)
        if scores.shape[-1] == 1 else scores)
    fsample = partial(bay.Distribution.sample, sample_shape=sample_shape)
    attention_distribution = bay.coercible_tensor(
        attention_distribution, convert_to_tensor_fn=fsample)
  ### hard attention
  elif AlignHard in self:
    attention_distribution = bay.distributions.OneHotCategorical(
        logits=bk.squeeze(scores, axis=-1)
        if scores.shape[-1] == 1 else scores,
        dtype=value.dtype)
    fsample = partial(bay.Distribution.sample, sample_shape=sample_shape)
    attention_distribution = bay.coercible_tensor(
        attention_distribution, convert_to_tensor_fn=fsample)
  # ====== dropout the attention scores ====== #
  attention = bk.dropout(attention_distribution,
                         p_drop=dropout,
                         axis=1 if temporal_dropout else None,
                         training=training and dropout > 0)
  # ====== applying the attention ====== #
  if self.is_self_attention and ScoreLocation in self:
    result = (bk.expand_dims(bk.array(attention), axis=-1) * value
              if attention.shape[-1] != 1 else attention * value)
  else:
    if PosLocalM in self:
      value = value[:, -Tv:] if num_heads == 0 else value[:, :, -Tv:]
    result = bk.matmul(attention, value)
  # ====== applying the Query mask ====== #
  if q_mask is not None:
    assert q_mask.shape[1] == Tq, \
        "Query mask has time dimension %d, but query has time dimension %d" \
        % (q_mask.shape[1], Tq)
    # Mask of shape [batch_size, Tq, 1].
    q_mask = bk.expand_dims(q_mask, axis=-1)
    result *= bk.cast(q_mask, dtype=result.dtype)
  # ====== residual connection ====== #
  if residual:
    if query is None:
      raise ValueError("query must be given for residual connection")
    result += query
  # ====== return ====== #
  return result, attention_distribution

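
# Illustrative sketch (assumption, plain NumPy): the masking scheme used
# above. Masked (padding or future) positions receive a -1e9 bias, so after
# softmax they get numerically zero attention weight.
import numpy as np

def softmax(x, axis=-1):
  e = np.exp(x - x.max(axis=axis, keepdims=True))
  return e / e.sum(axis=axis, keepdims=True)

Tq = Tv = 4
scores = np.random.randn(1, Tq, Tv)                  # [batch, Tq, Tv]
causal_mask = np.tril(np.ones((1, Tq, Tv), bool))    # position i attends to j <= i
scores = scores - 1e9 * (~causal_mask)               # bias out masked positions
attention = softmax(scores, axis=-1)
assert np.allclose(np.triu(attention[0], k=1), 0.0)  # no weight on the future
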
def score(self,
          query,
          key=None,
          scale=1,
          window_width=None,
          q_proj=None,
          target_proj=None):
  r""" Compute the attention scores.

  Arguments:
    query: Query (or target sequence) tensor of shape
      `[batch_size, Tq, dim]` or `[num_heads, batch_size, Tq, dim]` in
      case of multi-heads attention.
    key: Key (or source sequence) tensor of shape
      `[batch_size, Tv, dim]` or `[num_heads, batch_size, Tv, dim]` in
      case of multi-heads attention.
    scale: single `Scalar` or `Tensor` of shape `[dim]` for scaling the
      attention scores, suggested `1/sqrt(dim)` in (Vaswani et al. 2017).
    window_width: `None`, `Integer` or `Float` in `[0, 1]`. The total
      number of frames for a single window in local attention
      (i.e. `left + 1 + right`). Can be given as a fixed number of frames
      (`int`), or a percentage of the sequence length (`float`).
      If `None`, use `Tq`.
    q_proj: `Dense`, instance of dense or fully connected layer
      - for `ScoreLocation`, the number of hidden units is `1`
      - for `ScoreGeneral`, the number of hidden units is `dim`
    target_proj: `Dense`, for predictive local attention, applying a
      fully connected network on the target sequence (i.e. the query) to
      predict the position on the source sequence (i.e. the key).
      The layer must have output dimension equal to 1 and return a logit
      value.

  Returns:
    Tensor of shape `[num_heads, batch_size, Tq, Tv]`, or
      `[num_heads, batch_size, Tq, 1]` if `ScoreLocation`
  """
  ### Check if multi-head attention is used
  num_heads = _get_num_heads(query)
  if num_heads > 0:
    query = bk.reshape(query, [-1] + [i for i in query.shape[2:]])
    if key is not None:
      key = bk.reshape(key, [-1] + [i for i in key.shape[2:]])
  Tq = query.shape[1]
  Tv = Tq if key is None else key.shape[1]
  # scale shape is `[]` or `[dim]`
  scale = bk.array(scale, dtype=query.dtype)
  ### Check the window width
  if window_width is None:
    window_width = Tq
  elif window_width < 1:
    window_width = window_width * Tv
  window_width = int(window_width)
  ### Location-based attention (ScoreLocation)
  if AttentionMechanism.ScoreLocation in self:
    if PosLocalM in self or PosLocalP in self:
      raise NotImplementedError(
          "ScoreLocation only supports Global attention, but given: %s" %
          str(self))
    # [batch_size * num_heads, Tq, 1]
    scores = bk.reduce_mean(scale) * q_proj(query)
    assert scores.shape[-1] == 1, \
        "q_proj must have only 1 hidden unit, but given %d" % scores.shape[-1]
  ### Other score modes need the key tensor
  else:
    if key is None:
      raise ValueError("key must be provided for attention type: %s" %
                       str(self))
    ### Attention position (local or global)
    if PosLocalM in self:
      key = key[:, -window_width:]
    elif PosLocalP in self:
      pt = bk.sigmoid(target_proj(bk.reshape(query, ([0], -1))))
      assert pt.shape[-1] == 1, \
          "target_proj must project the query [., Tq * dim] to [., 1], i.e. " + \
          "predicting the attention position on the source sequence using " + \
          "knowledge from the target sequence."
      pt = Tv * pt  # `[batch_size * num_heads, 1]`
      # `[batch_size * num_heads, Tv]`
      # Eq (10) (Luong et al. 2015)
      gauss_est = bk.exp(-bk.square(bk.arange(Tv, dtype=pt.dtype) - pt) /
                         (2 * bk.square(window_width / 2)))
      # `[batch_size * num_heads, 1, Tv]`
      gauss_est = bk.expand_dims(gauss_est, axis=1)
    ### Additive or concat method
    if AttentionMechanism.ScoreAdditive in self:
      # [batch_size * num_heads, Tq, 1, dim]
      q = bk.expand_dims(query, axis=2)
      # [batch_size * num_heads, 1, Tv, dim]
      k = bk.expand_dims(key, axis=1)
      # [batch_size * num_heads, Tq, Tv]
      scores = bk.reduce_sum(scale * bk.tanh(q + k), axis=-1)
    ### Dot product or multiplicative scoring
    elif AttentionMechanism.ScoreDotProd in self:
      # this is a trick to make attention_scale broadcastable when
      # scale_tied=False
      scores = bk.matmul(scale * query, bk.swapaxes(key, 1, 2))
    ### Cosine scoring
    elif AttentionMechanism.ScoreCosine in self:
      # [batch_size * num_heads, Tq, 1, dim]
      q = bk.expand_dims(query, axis=2)
      # [batch_size * num_heads, 1, Tv, dim]
      k = bk.expand_dims(key, axis=1)
      # [batch_size * num_heads, Tq, Tv, dim]
      scores = (q * k) / (bk.norm(q, p=2) * bk.norm(k, p=2))
      scores = bk.reduce_sum(scale * scores, axis=-1, keepdims=False)
    ### General method with only a projection on the query
    elif AttentionMechanism.ScoreGeneral in self:
      query = q_proj(query)
      assert query.shape[-1] == key.shape[-1], \
          "q_proj must have %d hidden units, but given %d units" % \
          (key.shape[-1], query.shape[-1])
      scores = bk.matmul(scale * query, bk.swapaxes(key, 1, 2))
    else:
      raise NotImplementedError("No support for attention_type='%s'" %
                                str(self))
    ### applying the local-predictive attention
    if PosLocalP in self:
      scores = scores * gauss_est
  ### get back the multi-heads shape
  if num_heads > 0:
    scores = bk.reshape(scores,
                        shape=[num_heads, -1] + [i for i in scores.shape[1:]])
  return scores

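
# Illustrative sketch (assumption, plain NumPy): the predictive local
# attention window of Eq (10) in Luong et al. 2015, as used above. Scores far
# from the predicted center p_t are down-weighted by a Gaussian with standard
# deviation D/2, where D is the window width.
import numpy as np

Tv, window_width = 10, 4
pt = np.array([[3.2]])                         # predicted position, [batch, 1]
positions = np.arange(Tv, dtype=np.float64)    # source positions 0..Tv-1
gauss_est = np.exp(-np.square(positions - pt) /
                   (2 * np.square(window_width / 2)))   # [batch, Tv]

# multiply element-wise with scores of shape [batch, Tq, Tv]
scores = np.random.randn(1, 5, Tv)
local_scores = scores * gauss_est[:, np.newaxis, :]
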