def call(self, inputs, training=None, **kwargs):
    inputs, memory = inputs
    batch_size = K.shape(inputs)[0]
    seq_len = K.shape(inputs)[1]
    # Memory tokens are always visible.
    mem_mask = K.tile(K.ones_like(memory[:, :, :1], dtype=K.floatx()), [1, 1, seq_len])
    # Build content mask with random permutation
    ranges = K.tile(K.expand_dims(K.arange(0, seq_len), axis=-1), [1, batch_size])
    if self.enabled:
        shuffle = random_shuffle(ranges)
    else:
        shuffle = ranges
    if self.directional:
        shuffled = K.in_train_phase(shuffle, ranges, training)
    else:
        if self.enabled:
            shuffled = K.in_train_phase(shuffle, ranges + seq_len, training)
        else:
            shuffled = ranges + seq_len
    ranges = K.expand_dims(K.permute_dimensions(ranges, [1, 0]), axis=-1)
    shuffled = K.expand_dims(K.permute_dimensions(shuffled, [1, 0]), axis=1)
    content_mask = K.cast(ranges <= shuffled, dtype=K.floatx())
    # Build query mask based on content mask; the query stream never sees its own position.
    ranges = K.arange(0, seq_len)
    eye = K.equal(K.expand_dims(ranges, axis=0), K.expand_dims(ranges, axis=-1))
    eye = K.expand_dims(K.cast(eye, dtype=K.floatx()), axis=0)
    query_mask = content_mask * (1.0 - eye)
    content_mask = K.concatenate([mem_mask, content_mask], axis=1)
    query_mask = K.concatenate([mem_mask, query_mask], axis=1)
    return [
        K.permute_dimensions(content_mask, [0, 2, 1]),
        K.permute_dimensions(query_mask, [0, 2, 1]),
    ]
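# Hedged, standalone NumPy sketch (illustrative helper, not part of the layer) of the
# masking rule the code above appears to implement: given one factorization order,
# token i's content stream may attend to token j iff j comes no later than i in that
# order, and the query stream additionally never sees its own position. The layer's
# exact axis layout, batching and memory handling differ from this simplification.
import numpy as np

def permutation_masks(order):
    order = np.asarray(order)               # order[k] = token generated at step k
    rank = np.argsort(order)                # rank[t] = step at which token t is generated
    content = (rank[None, :] <= rank[:, None]).astype(float)   # row i: tokens i may attend to
    query = content * (1.0 - np.eye(len(order)))                # same, minus the diagonal
    return content, query

content_mask, query_mask = permutation_masks([2, 0, 3, 1])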
def call(self, inputs, mask=None, **kwargs):
    if isinstance(inputs, list):
        query, key, value = inputs
    else:
        query = key = value = inputs
    if isinstance(mask, list):
        mask = mask[1]
    feature_dim = K.shape(query)[-1]
    # Scaled dot-product scores, then a numerically stable softmax numerator.
    e = K.batch_dot(query, key, axes=2) / K.sqrt(K.cast(feature_dim, dtype=K.floatx()))
    e = K.exp(e - K.max(e, axis=-1, keepdims=True))
    if self.history_only:
        # Causal restriction: a query may only attend to keys at or before its position.
        query_len, key_len = K.shape(query)[1], K.shape(key)[1]
        indices = K.expand_dims(K.arange(0, key_len), axis=0)
        upper = K.expand_dims(K.arange(0, query_len), axis=-1)
        e *= K.expand_dims(K.cast(indices <= upper, K.floatx()), axis=0)
    if mask is not None:
        e *= K.cast(K.expand_dims(mask, axis=-2), K.floatx())
    # Normalize; the extra term keeps fully masked rows from dividing by zero.
    s = K.sum(e, axis=-1, keepdims=True)
    s += K.cast(K.equal(s, 0.0), K.floatx())
    a = e / s
    v = K.batch_dot(a, value)
    if self.return_attention:
        return [v, a]
    return v
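# Hedged NumPy sketch (illustrative helper, not library code; assumes a single batch
# element and 2-D q/k/v) of the attention math above: scaled scores, a numerically
# stable exp, an optional causal restriction (key index <= query index), and the guard
# that keeps fully masked rows at zero instead of producing NaN.
import numpy as np

def scaled_dot_attention(q, k, v, causal=False):
    d = q.shape[-1]
    e = q @ k.T / np.sqrt(d)                        # [q_len, k_len] raw scores
    e = np.exp(e - e.max(axis=-1, keepdims=True))   # stable softmax numerator
    if causal:
        q_len, k_len = q.shape[0], k.shape[0]
        e *= (np.arange(k_len)[None, :] <= np.arange(q_len)[:, None])
    s = e.sum(axis=-1, keepdims=True)
    s += (s == 0.0)                                 # avoid division by zero on empty rows
    a = e / s
    return a @ v, a

q = k = v = np.random.rand(5, 8)
out, attn = scaled_dot_attention(q, k, v, causal=True)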
def _attention_regularizer(self, attention):
    batch_size = K.cast(K.shape(attention)[0], K.floatx())
    input_len = K.shape(attention)[-1]
    indices = K.expand_dims(K.arange(0, input_len), axis=0)
    diagonal = K.expand_dims(K.arange(0, input_len), axis=-1)
    eye = K.cast(K.equal(indices, diagonal), K.floatx())
    return self.attention_regularizer_weight * K.sum(K.square(K.batch_dot(
        attention,
        K.permute_dimensions(attention, (0, 2, 1))) - eye)) / batch_size
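# Hedged NumPy sketch of the penalty above (illustrative helper, not library code):
# it pushes each attention matrix A toward A @ A.T == I, i.e. rows of A should be
# roughly orthonormal, summed per sample, averaged over the batch, and scaled by the
# regularizer weight (here just `weight`).
import numpy as np

def attention_orthogonal_penalty(attention, weight):
    # attention: [batch, seq_len, seq_len]
    eye = np.eye(attention.shape[-1])
    diff = attention @ np.transpose(attention, (0, 2, 1)) - eye
    return weight * np.square(diff).sum() / attention.shape[0]

penalty = attention_orthogonal_penalty(np.random.rand(2, 5, 5), weight=1e-4)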
def call(self, inputs, **kwargs):
    length = K.shape(inputs[0])[1] + K.shape(inputs[1])[1]
    inputs = K.tile(
        K.expand_dims(K.arange(length - 1, -1, -1, dtype=K.floatx()), axis=0),
        [K.shape(inputs[0])[0], 1],
    )
    if self.clamp_len is not None:
        inputs = K.clip(inputs, min_value=0, max_value=self.clamp_len)
    inputs = K.expand_dims(inputs, axis=-1)
    output_dim = K.cast(self.output_dim, K.floatx())
    ranges = K.expand_dims(K.arange(0.0, self.output_dim, 2.0), axis=0) / output_dim
    inverse = 1.0 / K.pow(10000.0, ranges)
    positions = inputs * inverse
    return K.concatenate([K.sin(positions), K.cos(positions)], axis=-1)
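# Hedged NumPy sketch of the embedding above (illustrative helper, not library code):
# relative positions run from (q_len + m_len - 1) down to 0, are optionally clamped,
# and each position p maps to sin(p / 10000^(2i/d)) and cos(p / 10000^(2i/d)) with the
# sine and cosine halves concatenated (not interleaved). Batch tiling is omitted.
import numpy as np

def relative_sinusoid(q_len, m_len, output_dim, clamp_len=None):
    pos = np.arange(q_len + m_len - 1, -1, -1, dtype=float)
    if clamp_len is not None:
        pos = np.clip(pos, 0, clamp_len)
    inverse = 1.0 / np.power(10000.0, np.arange(0.0, output_dim, 2.0) / output_dim)
    angles = pos[:, None] * inverse[None, :]
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)  # [k_len, output_dim]

emb = relative_sinusoid(q_len=4, m_len=3, output_dim=16)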
def call(self, inputs, **kwargs):
    q_len, m_len = K.shape(inputs[0])[1], K.shape(inputs[1])[1]
    k_len = q_len + m_len
    start, stop = k_len, -1
    if not self.directional:
        stop = -q_len
    inputs = K.tile(
        K.expand_dims(K.arange(start, stop, -1, dtype=K.floatx()), axis=0),
        [K.shape(inputs[0])[0], 1],
    )
    if self.clamp_len is not None:
        inputs = K.clip(inputs, min_value=0, max_value=self.clamp_len)
    inputs = K.expand_dims(inputs, axis=-1)
    output_dim = K.cast(self.output_dim, K.floatx())
    ranges = K.expand_dims(K.arange(0.0, self.output_dim, 2.0), axis=0) / output_dim
    inverse = 1.0 / K.pow(10000.0, ranges)
    positions = inputs * inverse
    return K.concatenate([K.sin(positions), K.cos(positions)], axis=-1)
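# Hedged sketch of what the directional flag changes relative to the previous method:
# only the range of relative offsets. Directional (causal) attention counts k_len down
# to 0; the bidirectional case extends into negative offsets, down to -(q_len - 1).
# The helper name is illustrative, not from any library.
import numpy as np

def relative_positions(q_len, m_len, directional):
    k_len = q_len + m_len
    stop = -1 if directional else -q_len
    return np.arange(k_len, stop, -1, dtype=float)

print(relative_positions(q_len=3, m_len=2, directional=True))   # [5. 4. 3. 2. 1. 0.]
print(relative_positions(q_len=3, m_len=2, directional=False))  # [5. 4. ... -1. -2.]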
def call(self, inputs, mask=None):
    input_shape = K.shape(inputs)
    if self.mode == self.MODE_ADD:
        batch_size, seq_len, output_dim = input_shape[0], input_shape[1], input_shape[2]
        pos_input = K.tile(K.expand_dims(K.arange(0, seq_len), axis=0), [batch_size, 1])
    elif self.mode == self.MODE_CONCAT:
        batch_size, seq_len, output_dim = input_shape[0], input_shape[1], self.output_dim
        pos_input = K.tile(K.expand_dims(K.arange(0, seq_len), axis=0), [batch_size, 1])
    else:
        # Otherwise the inputs themselves are the positions to embed.
        output_dim = self.output_dim
        pos_input = inputs
    if K.dtype(pos_input) != K.floatx():
        pos_input = K.cast(pos_input, K.floatx())
    # Even output dimensions use sine, odd dimensions use cosine with the same frequency.
    evens = K.arange(0, output_dim // 2) * 2
    odds = K.arange(0, output_dim // 2) * 2 + 1
    even_embd = K.sin(
        K.dot(
            K.expand_dims(pos_input, -1),
            K.expand_dims(1.0 / K.pow(
                10000.0,
                K.cast(evens, K.floatx()) / K.cast(output_dim, K.floatx())
            ), 0)
        )
    )
    odd_embd = K.cos(
        K.dot(
            K.expand_dims(pos_input, -1),
            K.expand_dims(1.0 / K.pow(
                10000.0,
                K.cast((odds - 1), K.floatx()) / K.cast(output_dim, K.floatx())
            ), 0)
        )
    )
    embd = K.stack([even_embd, odd_embd], axis=-1)
    output = K.reshape(embd, [-1, K.shape(inputs)[1], output_dim])
    if self.mode == self.MODE_CONCAT:
        output = K.concatenate([inputs, output], axis=-1)
    if self.mode == self.MODE_ADD:
        output += inputs
    return output
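# Hedged NumPy sketch of the add/concat branches above (illustrative helper, not
# library code; assumes an even output_dim): dimension 2i of the embedding is
# sin(pos / 10000^(2i/d)) and dimension 2i+1 is cos with the same frequency; the
# stack-then-reshape interleaves the pairs exactly like the code.
import numpy as np

def trig_pos_embedding(seq_len, output_dim):
    pos = np.arange(seq_len, dtype=float)
    evens = np.arange(output_dim // 2) * 2.0
    freq = 1.0 / np.power(10000.0, evens / output_dim)    # shared by each sin/cos pair
    angles = pos[:, None] * freq[None, :]                 # [seq_len, output_dim // 2]
    embd = np.stack([np.sin(angles), np.cos(angles)], axis=-1)
    return embd.reshape(seq_len, output_dim)              # interleaved sin/cos

emb = trig_pos_embedding(seq_len=10, output_dim=16)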
def call(self, inputs, mask=None, **kwargs):
    input_len = K.shape(inputs)[1]
    if self.attention_type == SeqSelfAttention.ATTENTION_TYPE_ADD:
        e = self._call_additive_emission(inputs)
    elif self.attention_type == SeqSelfAttention.ATTENTION_TYPE_MUL:
        e = self._call_multiplicative_emission(inputs)
    if self.attention_activation is not None:
        e = self.attention_activation(e)
    e = K.exp(e - K.max(e, axis=-1, keepdims=True))
    if self.attention_width is not None:
        # Restrict attention to a local window around (or ending at) each position.
        if self.history_only:
            lower = K.arange(0, input_len) - (self.attention_width - 1)
        else:
            lower = K.arange(0, input_len) - self.attention_width // 2
        lower = K.expand_dims(lower, axis=-1)
        upper = lower + self.attention_width
        indices = K.expand_dims(K.arange(0, input_len), axis=0)
        e = e * K.cast(lower <= indices, K.floatx()) * K.cast(indices < upper, K.floatx())
    if mask is not None:
        mask = K.cast(mask, K.floatx())
        mask = K.expand_dims(mask)
        e = K.permute_dimensions(K.permute_dimensions(e * mask, (0, 2, 1)) * mask, (0, 2, 1))
    # a_{t} = \text{softmax}(e_t)
    s = K.sum(e, axis=-1, keepdims=True)
    a = e / (s + K.epsilon())
    # l_t = \sum_{t'} a_{t, t'} x_{t'}
    v = K.batch_dot(a, inputs)
    if self.attention_regularizer_weight > 0.0:
        self.add_loss(self._attention_regularizer(a))
    if self.return_attention:
        return [v, a]
    return v
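# Hedged NumPy sketch of the local attention window above (illustrative helper, not
# library code): with width w, query t may attend to keys in [t - w + 1, t] when
# history_only is set, otherwise to the centered window [t - w // 2, t - w // 2 + w).
import numpy as np

def window_mask(seq_len, width, history_only):
    t = np.arange(seq_len)
    lower = t - (width - 1) if history_only else t - width // 2
    lower = lower[:, None]
    upper = lower + width
    j = np.arange(seq_len)[None, :]
    return ((lower <= j) & (j < upper)).astype(float)   # [seq_len, seq_len], 1 = visible

print(window_mask(6, 3, history_only=True))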