def testBuildFutureMaskWithMaxLen(self):
  num_heads = 4
  length = [2, 4, 3]
  maximum_length = 5
  expected = [
      [[1.0, 0.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0, 0.0]],
      [[1.0, 0.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 1.0, 0.0, 0.0],
       [1.0, 1.0, 1.0, 1.0, 0.0],
       [1.0, 1.0, 1.0, 1.0, 0.0]],
      [[1.0, 0.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 1.0, 0.0, 0.0],
       [1.0, 1.0, 1.0, 0.0, 0.0],
       [1.0, 1.0, 1.0, 0.0, 0.0]]]
  mask = transformer.build_future_mask(
      tf.constant(length),
      num_heads=num_heads,
      maximum_length=maximum_length)
  with self.test_session() as sess:
    mask = sess.run(mask)
    # Move the heads dimension first so every head can be compared against
    # the same per-batch expected masks.
    mask = np.transpose(mask, (1, 0, 2, 3))
    for h in range(num_heads):
      self.assertAllEqual(expected, mask[h])
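# A minimal NumPy sketch (illustrative only, not library code) of what
# build_future_mask computes in the tests above and below: a lower-triangular
# "no peeking at the future" matrix combined with a per-sequence length mask.
# The helper name and [batch, time, time] output shape are assumptions; the
# library additionally inserts a broadcastable heads dimension.
import numpy as np

def future_mask_sketch(lengths, maximum_length=None):
  lengths = np.asarray(lengths)
  max_len = maximum_length or lengths.max()
  causal = np.tril(np.ones((max_len, max_len), dtype=np.float32))  # [T, T]
  valid = (np.arange(max_len)[None, :] < lengths[:, None]).astype(np.float32)  # [B, T]
  # Each query position may attend to past positions that are also
  # within the sequence length.
  return causal[None, :, :] * valid[:, None, :]  # [B, T, T]

# future_mask_sketch([2, 4, 3], maximum_length=5) reproduces `expected` above.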
def _run(self,
         inputs,
         sequence_length=None,
         cache=None,
         memory=None,
         memory_sequence_length=None,
         step=None,
         training=None):
  # Process inputs.
  inputs *= self.num_units**0.5
  if self.position_encoder is not None:
    inputs = self.position_encoder(
        inputs, position=step + 1 if step is not None else None)
  inputs = common.dropout(inputs, self.dropout, training=training)

  # Prepare query mask.
  mask = None
  if sequence_length is not None:
    mask = transformer.build_future_mask(
        sequence_length, maximum_length=tf.shape(inputs)[1])

  # Prepare memory mask.
  memory_mask = None
  if memory is not None:
    if not isinstance(memory, (list, tuple)):
      memory = (memory,)
    if memory_sequence_length is not None:
      if not isinstance(memory_sequence_length, (list, tuple)):
        memory_sequence_length = (memory_sequence_length,)
      memory_mask = []
      for mem, mem_length in zip(memory, memory_sequence_length):
        mem_mask = tf.sequence_mask(
            mem_length, maxlen=tf.shape(mem)[1], dtype=tf.float32)
        mem_mask = tf.expand_dims(mem_mask, 1)
        memory_mask.append(mem_mask)

  # Run each layer.
  new_cache = []
  for i, layer in enumerate(self.layers):
    inputs, layer_cache, attention = layer(
        inputs,
        mask=mask,
        memory=memory,
        memory_mask=memory_mask,
        cache=cache[i] if cache is not None else None,
        training=training)
    new_cache.append(layer_cache)
  outputs = self.layer_norm(inputs)
  return outputs, new_cache, attention
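# Shape sketch (assumed, illustrative shapes) for the memory mask built
# above: tf.sequence_mask gives [batch, memory_time], and expand_dims(mask, 1)
# yields [batch, 1, memory_time], so a single mask broadcasts over every
# query position of [batch, queries_time, memory_time] attention scores.
import numpy as np

mem_length = np.array([3, 2])  # [batch]
mem_time = 4
mem_mask = (np.arange(mem_time)[None, :] < mem_length[:, None]).astype(np.float32)
mem_mask = mem_mask[:, None, :]            # [batch, 1, memory_time]
scores = np.zeros((2, 5, mem_time))        # [batch, queries_time, memory_time]
masked = scores + (mem_mask - 1.0) * 1e9   # broadcasts over the 5 query positions
print(masked.shape)                        # (2, 5, 4)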
def testBuildFutureMask(self):
  num_heads = 4
  length = [2, 4, 3]
  expected = [
      [[1.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0]],
      [[1.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0],
       [1.0, 1.0, 1.0, 0.0],
       [1.0, 1.0, 1.0, 1.0]],
      [[1.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0],
       [1.0, 1.0, 1.0, 0.0],
       [1.0, 1.0, 1.0, 0.0]]]
  mask = transformer.build_future_mask(tf.constant(length), num_heads=num_heads)
  mask = self.evaluate(mask)
  self.assertTupleEqual(
      mask.shape, (len(length), 1, max(length), max(length)))
  self.assertAllEqual(np.squeeze(mask), expected)
def testBuildFutureMaskWithMaxLen(self):
  num_heads = 4
  length = [2, 4, 3]
  maximum_length = 5
  expected = [
      [[1.0, 0.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0, 0.0]],
      [[1.0, 0.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 1.0, 0.0, 0.0],
       [1.0, 1.0, 1.0, 1.0, 0.0],
       [1.0, 1.0, 1.0, 1.0, 0.0]],
      [[1.0, 0.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 0.0, 0.0, 0.0],
       [1.0, 1.0, 1.0, 0.0, 0.0],
       [1.0, 1.0, 1.0, 0.0, 0.0],
       [1.0, 1.0, 1.0, 0.0, 0.0]]]
  mask = transformer.build_future_mask(
      tf.constant(length),
      num_heads=num_heads,
      maximum_length=maximum_length)
  with self.test_session() as sess:
    mask = sess.run(mask)
    self.assertTupleEqual(
        mask.shape, (len(length), 1, maximum_length, maximum_length))
    self.assertAllEqual(np.squeeze(mask), expected)
def testMaskedScaledDotAttention(self):
  batch_size = 3
  num_heads = 8
  queries_length = [8, 6, 10]
  depth = 20
  queries = tf.placeholder_with_default(
      np.random.randn(
          batch_size, num_heads, max(queries_length), depth).astype(np.float32),
      shape=(None, num_heads, None, depth))
  mask = transformer.build_future_mask(queries_length, num_heads=num_heads)
  context, attn = transformer.dot_product_attention(
      queries, queries, queries, tf.estimator.ModeKeys.PREDICT, mask=mask)
  with self.test_session() as sess:
    context, attn = sess.run([context, attn])
    # Entries above the diagonal attend to future positions and must
    # receive exactly zero attention weight.
    illegal_connections = np.triu_indices(max(queries_length), 1)
    for i in range(batch_size):
      for h in range(num_heads):
        self.assertEqual(0.0, np.sum(attn[i, h][illegal_connections]))
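# NumPy sketch of the property this test checks, assuming standard scaled
# dot-product attention (single head, single sequence for brevity): masked
# logits are pushed to a large negative value before the softmax, so the
# weights above the diagonal come out exactly zero.
import numpy as np

T, depth = 5, 8
q = np.random.randn(T, depth).astype(np.float32)
logits = q.dot(q.T) / np.sqrt(depth)               # [T, T] attention logits
mask = np.tril(np.ones((T, T), dtype=np.float32))  # future (causal) mask
logits = np.where(mask > 0, logits, -1e9)          # hide future positions
attn = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)
assert np.sum(attn[np.triu_indices(T, 1)]) == 0.0  # same check as the test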
def _self_attention_stack(self,
                          inputs,
                          sequence_length=None,
                          mode=tf.estimator.ModeKeys.TRAIN,
                          cache=None,
                          memory=None,
                          memory_sequence_length=None):
  inputs = tf.layers.dropout(
      inputs,
      rate=self.dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)

  decoder_mask = None
  memory_mask = None

  if sequence_length is not None:
    decoder_mask = transformer.build_future_mask(
        sequence_length,
        num_heads=self.num_heads,
        maximum_length=tf.shape(inputs)[1],
        dtype=inputs.dtype)
  if memory is not None:
    if cache is not None:
      memory_mask = cache["memory_mask"]
    elif memory_sequence_length is not None:
      memory_mask = self._build_memory_mask(
          memory, memory_sequence_length=memory_sequence_length)

  for l in range(self.num_layers):
    layer_name = "layer_{}".format(l)
    layer_cache = cache[layer_name] if cache is not None else None
    with tf.variable_scope(layer_name):
      with tf.variable_scope("masked_multi_head"):
        encoded = transformer.multi_head_attention(
            self.num_heads,
            transformer.norm(inputs),
            None,
            mode,
            num_units=self.num_units,
            mask=decoder_mask,
            cache=layer_cache,
            dropout=self.attention_dropout)
        encoded = transformer.drop_and_add(
            inputs, encoded, mode, dropout=self.dropout)

      if memory is not None:
        with tf.variable_scope("multi_head"):
          context = transformer.multi_head_attention(
              self.num_heads,
              transformer.norm(encoded),
              memory,
              mode,
              mask=memory_mask,
              cache=layer_cache,
              dropout=self.attention_dropout)
          context = transformer.drop_and_add(
              encoded, context, mode, dropout=self.dropout)
      else:
        # Without a memory, feed the self-attention output directly to the
        # feed-forward block (otherwise `context` would be unbound).
        context = encoded

      with tf.variable_scope("ffn"):
        transformed = transformer.feed_forward(
            transformer.norm(context),
            self.ffn_inner_dim,
            mode,
            dropout=self.relu_dropout)
        transformed = transformer.drop_and_add(
            context, transformed, mode, dropout=self.dropout)

    inputs = transformed

  outputs = transformer.norm(inputs)
  return outputs
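# Sketch (illustrative, inference behavior) of the pre-norm residual pattern
# every sub-layer above follows: transformer.drop_and_add computes, in effect,
# x + dropout(sublayer(norm(x))), with norm applied before each sub-layer and
# a final transformer.norm closing the stack.
import numpy as np

def norm(x):
  # Layer norm without learned gain/bias, for illustration only.
  return (x - x.mean(-1, keepdims=True)) / (x.std(-1, keepdims=True) + 1e-6)

def pre_norm_residual(x, sublayer):
  return x + sublayer(norm(x))  # dropout omitted at inference

x = np.random.randn(4, 8)
out = pre_norm_residual(x, lambda h: h * 0.5)  # stand-in sub-layer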
def _self_attention_stack(self,
                          inputs,
                          sequence_length=None,
                          mode=tf.estimator.ModeKeys.TRAIN,
                          cache=None,
                          memory=None,
                          memory_sequence_length=None,
                          step=None):
  inputs *= self.num_units**0.5
  if self.position_encoder is not None:
    if step is None:
      inputs = self.position_encoder(inputs, sequence_length=sequence_length)
    else:
      inputs = self.position_encoder.apply_one(inputs, step + 1)

  inputs = tf.layers.dropout(
      inputs,
      rate=self.dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)

  decoder_mask = None
  memory_mask = None
  last_attention = None

  if self.self_attention_type == "scaled_dot":
    if sequence_length is not None:
      decoder_mask = transformer.build_future_mask(
          sequence_length,
          num_heads=self.num_heads,
          maximum_length=tf.shape(inputs)[1])
  elif self.self_attention_type == "average":
    if cache is None:
      if sequence_length is None:
        sequence_length = tf.fill([tf.shape(inputs)[0]], tf.shape(inputs)[1])
      decoder_mask = transformer.cumulative_average_mask(
          sequence_length, maximum_length=tf.shape(inputs)[1], dtype=inputs.dtype)

  if memory is not None and memory_sequence_length is not None:
    memory_mask = transformer.build_sequence_mask(
        memory_sequence_length,
        num_heads=self.num_heads,
        maximum_length=tf.shape(memory)[1])

  for l in range(self.num_layers):
    layer_name = "layer_{}".format(l)
    layer_cache = cache[layer_name] if cache is not None else None
    with tf.variable_scope(layer_name):
      if self.self_attention_type == "scaled_dot":
        with tf.variable_scope("masked_multi_head"):
          encoded = transformer.multi_head_attention(
              self.num_heads,
              transformer.norm(inputs),
              None,
              mode,
              num_units=self.num_units,
              mask=decoder_mask,
              cache=layer_cache,
              dropout=self.attention_dropout)
          encoded = transformer.drop_and_add(
              inputs, encoded, mode, dropout=self.dropout)
      elif self.self_attention_type == "average":
        with tf.variable_scope("average_attention"):
          # Cumulative average.
          x = transformer.norm(inputs)
          y = transformer.cumulative_average(
              x, decoder_mask if cache is None else step, cache=layer_cache)
          # FFN.
          y = transformer.feed_forward(
              y, self.ffn_inner_dim, mode, dropout=self.relu_dropout)
          # Gating layer.
          z = tf.layers.dense(tf.concat([x, y], -1), self.num_units * 2)
          i, f = tf.split(z, 2, axis=-1)
          y = tf.sigmoid(i) * x + tf.sigmoid(f) * y
          encoded = transformer.drop_and_add(
              inputs, y, mode, dropout=self.dropout)

      if memory is not None:
        with tf.variable_scope("multi_head"):
          context, last_attention = transformer.multi_head_attention(
              self.num_heads,
              transformer.norm(encoded),
              memory,
              mode,
              mask=memory_mask,
              cache=layer_cache,
              dropout=self.attention_dropout,
              return_attention=True)
          context = transformer.drop_and_add(
              encoded, context, mode, dropout=self.dropout)
      else:
        context = encoded

      with tf.variable_scope("ffn"):
        transformed = transformer.feed_forward(
            transformer.norm(context),
            self.ffn_inner_dim,
            mode,
            dropout=self.relu_dropout)
        transformed = transformer.drop_and_add(
            context, transformed, mode, dropout=self.dropout)

    inputs = transformed

  if last_attention is not None:
    # The first head of the last layer is returned.
    first_head_attention = last_attention[:, 0]
  else:
    first_head_attention = None

  outputs = transformer.norm(inputs)
  return outputs, first_head_attention
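# NumPy sketch of the "average" self-attention branch above (assumed shapes
# [time, units], no cache, random weights; the FFN between the average and
# the gate is omitted for brevity). The cumulative average replaces scaled
# dot-product self-attention with y_t = mean(x_1 .. x_t), and the gating
# layer mixes x and y with input/forget gates, exactly as in the code above.
import numpy as np

def sigmoid(v):
  return 1.0 / (1.0 + np.exp(-v))

T, units = 4, 6
x = np.random.randn(T, units)
# Cumulative average: row t is the mean of the first t + 1 positions.
y = np.cumsum(x, axis=0) / np.arange(1, T + 1)[:, None]
# Gating layer: a single dense layer produces both gates from [x; y].
w = np.random.randn(2 * units, 2 * units)
z = np.concatenate([x, y], axis=-1).dot(w)
i, f = np.split(z, 2, axis=-1)
out = sigmoid(i) * x + sigmoid(f) * y  # gated combination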
def _self_attention_stack(self,
                          inputs,                       # [batch, max_dec_len, emb_dim]
                          sequence_length=None,         # [batch]
                          mode=tf.estimator.ModeKeys.TRAIN,
                          cache=None,
                          memory=None,                  # [batch, enc_len, num_units]
                          memory_sequence_length=None,  # [batch]
                          step=None):
  inputs *= self.num_units**0.5
  if self.position_encoder is not None:
    inputs = self.position_encoder(
        inputs, position=step + 1 if step is not None else None)

  # inputs: [batch, max_dec_len, emb_dim]
  inputs = tf.layers.dropout(
      inputs,
      rate=self.dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)

  decoder_mask = None
  memory_mask = None
  last_attention = None

  if self.self_attention_type == "scaled_dot":
    if sequence_length is not None:
      # sequence_length is None at decoding time, set at training time.
      decoder_mask = transformer.build_future_mask(  # [batch, 1, max_dec_len, max_dec_len]
          sequence_length,
          num_heads=self.num_heads,
          maximum_length=tf.shape(inputs)[1])
  elif self.self_attention_type == "average":
    if cache is None:
      if sequence_length is None:
        sequence_length = tf.fill([tf.shape(inputs)[0]], tf.shape(inputs)[1])
      decoder_mask = transformer.cumulative_average_mask(
          sequence_length, maximum_length=tf.shape(inputs)[1], dtype=inputs.dtype)

  if memory is not None and not tf.contrib.framework.nest.is_sequence(memory):
    memory = (memory,)
  if memory_sequence_length is not None:
    if not tf.contrib.framework.nest.is_sequence(memory_sequence_length):
      memory_sequence_length = (memory_sequence_length,)
    memory_mask = [  # each [batch, 1, 1, enc_len]
        transformer.build_sequence_mask(
            length, num_heads=self.num_heads, maximum_length=tf.shape(m)[1])
        for m, length in zip(memory, memory_sequence_length)]

  for l in range(self.num_layers):
    layer_name = "layer_{}".format(l)
    # No cache at training time; the cache is populated at decoding time.
    layer_cache = cache[layer_name] if cache is not None else None
    with tf.variable_scope(layer_name):
      # Self-attention encodes the decoder input (training) or the last step
      # output (decoding).
      if self.self_attention_type == "scaled_dot":
        with tf.variable_scope("masked_multi_head"):
          encoded = transformer.multi_head_attention(  # [batch, decode_len, hidden]
              self.num_heads,
              transformer.norm(inputs),
              None,
              mode,
              num_units=self.num_units,
              mask=decoder_mask,  # [batch, 1, len, len]
              cache=layer_cache,
              dropout=self.attention_dropout)
          last_context = transformer.drop_and_add(  # [batch, decode_len, hidden]
              inputs, encoded, mode, dropout=self.dropout)
      elif self.self_attention_type == "average":
        with tf.variable_scope("average_attention"):
          # Cumulative average.
          x = transformer.norm(inputs)
          y = transformer.cumulative_average(
              x, decoder_mask if cache is None else step, cache=layer_cache)
          # FFN.
          y = transformer.feed_forward(
              y, self.ffn_inner_dim, mode, dropout=self.relu_dropout)
          # Gating layer.
          z = tf.layers.dense(tf.concat([x, y], -1), self.num_units * 2)
          i, f = tf.split(z, 2, axis=-1)
          y = tf.sigmoid(i) * x + tf.sigmoid(f) * y
          last_context = transformer.drop_and_add(
              inputs, y, mode, dropout=self.dropout)

      # Attend to the encoder memory using the decoder context.
      if memory is not None:
        for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
          # No cache at training time; the cache is populated at decoding time.
          memory_cache = layer_cache["memory"][i] if layer_cache is not None else None
          with tf.variable_scope("multi_head" if i == 0 else "multi_head_%d" % i):
            context, last_attention = transformer.multi_head_attention(
                self.num_heads,
                transformer.norm(last_context),
                mem,        # [batch, enc_len, dim]
                mode,
                mask=mask,  # [batch, 1, 1, len]
                cache=memory_cache,
                dropout=self.attention_dropout,
                return_attention=True)
            # context: [batch, decode_len, num_units]
            # last_attention: [batch, head, dec_len, enc_len] at training time.
            last_context = transformer.drop_and_add(
                last_context,  # [batch, decode_len, num_units]
                context,
                mode,
                dropout=self.dropout)
            if i > 0:
              # Do not return attention in case of multi source.
              last_attention = None

      with tf.variable_scope("ffn"):
        transformed = transformer.feed_forward(  # [batch, decode_len, num_units]
            transformer.norm(last_context),
            self.ffn_inner_dim,
            mode,
            dropout=self.relu_dropout)
        transformed = transformer.drop_and_add(  # [batch, decode_len, num_units]
            last_context, transformed, mode, dropout=self.dropout)

    inputs = transformed

  if last_attention is not None:
    # The first head of the last layer is returned.
    first_head_attention = last_attention[:, 0]
  else:
    first_head_attention = None

  outputs = transformer.norm(inputs)  # [batch, decode_len, num_units]
  return outputs, first_head_attention
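# Minimal NumPy sketch (hypothetical, single head and unit batch) of why no
# future mask is needed at decoding time, as noted in the comments above: the
# decoding loop feeds one position per step and appends its keys/values to a
# per-layer cache, so attention can only ever see past positions and itself.
import numpy as np

units, steps = 6, 3
cache = {"keys": np.zeros((0, units)), "values": np.zeros((0, units))}
for step in range(steps):
  x = np.random.randn(1, units)  # current position only
  cache["keys"] = np.concatenate([cache["keys"], x], axis=0)
  cache["values"] = np.concatenate([cache["values"], x], axis=0)
  logits = x.dot(cache["keys"].T) / np.sqrt(units)  # [1, step + 1]
  attn = np.exp(logits) / np.exp(logits).sum()
  context = attn.dot(cache["values"])               # attends to past + self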