def testBuildFutureMaskWithMaxLen(self, dtype):
    length = [2, 4, 3]
    maximum_length = 5
    expected = np.array(
        [
            [
                [1, 0, 0, 0, 0],
                [1, 1, 0, 0, 0],
                [1, 1, 0, 0, 0],
                [1, 1, 0, 0, 0],
                [1, 1, 0, 0, 0],
            ],
            [
                [1, 0, 0, 0, 0],
                [1, 1, 0, 0, 0],
                [1, 1, 1, 0, 0],
                [1, 1, 1, 1, 0],
                [1, 1, 1, 1, 0],
            ],
            [
                [1, 0, 0, 0, 0],
                [1, 1, 0, 0, 0],
                [1, 1, 1, 0, 0],
                [1, 1, 1, 0, 0],
                [1, 1, 1, 0, 0],
            ],
        ]
    ).astype(dtype.as_numpy_dtype)
    mask = transformer.future_mask(
        tf.constant(length), maximum_length=maximum_length, dtype=dtype
    )
    self.assertIs(mask.dtype, dtype)
    mask = self.evaluate(mask)
    self.assertTupleEqual(
        mask.shape, (len(length), maximum_length, maximum_length)
    )
    self.assertAllEqual(mask, expected)
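
# A minimal standalone sketch (not part of the test above, and not the library's
# actual future_mask implementation) of how a causal mask with the tested shape
# [batch, maximum_length, maximum_length] can be built: row i may attend to
# positions <= i, clipped to the true sequence length.
import tensorflow as tf

def future_mask_sketch(lengths, maximum_length, dtype=tf.float32):
    lengths = tf.convert_to_tensor(lengths, dtype=tf.int32)
    # Per-row attention limit: min(i + 1, length), shape [batch, maximum_length].
    row_limit = tf.minimum(
        tf.range(1, maximum_length + 1), tf.expand_dims(lengths, 1)
    )
    # Expand each row limit into a row of the mask, shape [batch, max_len, max_len].
    return tf.sequence_mask(row_limit, maxlen=maximum_length, dtype=dtype)

# Example: lengths [2, 4, 3] with maximum_length 5 reproduce the expected masks.
print(future_mask_sketch([2, 4, 3], 5).numpy())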
def _run(
    self,
    inputs,
    sequence_length=None,
    cache=None,
    memory=None,
    memory_sequence_length=None,
    step=None,
    training=None,
):
    # Process inputs.
    inputs *= self.num_units**0.5
    if self.position_encoder is not None:
        inputs = self.position_encoder(
            inputs, position=step + 1 if step is not None else None
        )
    inputs = common.dropout(inputs, self.dropout, training=training)

    # Prepare query mask.
    mask = None
    if step is None:
        maximum_length = tf.shape(inputs)[1]
        if sequence_length is None:
            batch_size = tf.shape(inputs)[0]
            sequence_length = tf.fill([batch_size], maximum_length)
        mask = transformer.future_mask(
            sequence_length, maximum_length=maximum_length
        )

    # Prepare memory mask.
    memory_mask = None
    if memory is not None:
        if not isinstance(memory, (list, tuple)):
            memory = (memory,)
        if memory_sequence_length is not None:
            if not isinstance(memory_sequence_length, (list, tuple)):
                memory_sequence_length = (memory_sequence_length,)
            memory_mask = [
                tf.sequence_mask(mem_length, maxlen=tf.shape(mem)[1])
                for mem, mem_length in zip(memory, memory_sequence_length)
            ]

    # Run each layer.
    new_cache = []
    for i, layer in enumerate(self.layers):
        inputs, layer_cache, attention = layer(
            inputs,
            mask=mask,
            memory=memory,
            memory_mask=memory_mask,
            cache=cache[i] if cache is not None else None,
            training=training,
        )
        new_cache.append(layer_cache)
    outputs = self.layer_norm(inputs)
    return outputs, new_cache, attention
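
# A minimal standalone sketch (independent of the decoder above) of the memory
# mask built in _run: tf.sequence_mask turns per-example source lengths into a
# boolean [batch, max_source_len] mask that hides padded memory positions. The
# tensors below are made-up placeholders for illustration only.
import tensorflow as tf

memory = tf.zeros([3, 6, 8])                  # [batch, max_source_len, depth]
memory_sequence_length = tf.constant([6, 4, 2])

memory_mask = tf.sequence_mask(memory_sequence_length, maxlen=tf.shape(memory)[1])
print(memory_mask.numpy())
# [[ True  True  True  True  True  True]
#  [ True  True  True  True False False]
#  [ True  True False False False False]]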
def _run(
    self,
    inputs,
    sequence_length=None,
    cache=None,
    memory=None,
    memory_sequence_length=None,
    step=None,
    training=None,
):
    # Process inputs.
    inputs *= self.num_units ** 0.5
    if self.position_encoder is not None:
        inputs = self.position_encoder(
            inputs, position=step + 1 if step is not None else None
        )
    inputs = common.dropout(inputs, self.dropout, training=training)

    # Prepare query mask.
    mask = None
    if step is None:
        maximum_length = tf.shape(inputs)[1]
        if sequence_length is None:
            batch_size = tf.shape(inputs)[0]
            sequence_length = tf.fill([batch_size], maximum_length)
        mask = transformer.future_mask(
            sequence_length, maximum_length=maximum_length
        )

    # Prepare memory mask.
    memory_mask = None
    if memory is not None:
        if not isinstance(memory, (list, tuple)):
            memory = (memory,)
        if memory_sequence_length is not None:
            if not isinstance(memory_sequence_length, (list, tuple)):
                memory_sequence_length = (memory_sequence_length,)
            memory_mask = [
                tf.sequence_mask(mem_length, maxlen=tf.shape(mem)[1])
                for mem, mem_length in zip(memory, memory_sequence_length)
            ]
        else:
            memory_mask = tuple(None for _ in memory)

    # Run each layer.
    new_cache = []
    attention = []
    for i, layer in enumerate(self.layers):
        inputs, layer_cache, layer_attention = layer(
            inputs,
            mask=mask,
            memory=memory,
            memory_mask=memory_mask,
            cache=cache[i] if cache is not None else None,
            training=training,
        )
        attention.append(layer_attention)
        new_cache.append(layer_cache)
    outputs = self.layer_norm(inputs) if self.layer_norm is not None else inputs

    # Convert list of shape num_layers x num_sources to num_sources x num_layers.
    attention = list(map(list, zip(*attention)))
    if attention:
        attention = transformer.MultiHeadAttentionReduction.reduce(
            attention[0],  # Get attention to the first source.
            self.attention_reduction,
        )
    else:
        attention = None

    return outputs, new_cache, attention
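
# A minimal standalone sketch of the attention reshaping done at the end of
# _run: zip(*attention) transposes a num_layers x num_sources nested list into
# num_sources x num_layers, so attention[0] gathers, for the first source, the
# attention returned by every layer. Plain strings stand in for the per-layer
# attention tensors; they are illustrative only.
attention = [
    ["layer0/src0", "layer0/src1"],
    ["layer1/src0", "layer1/src1"],
]  # num_layers x num_sources

attention = list(map(list, zip(*attention)))
print(attention)
# [['layer0/src0', 'layer1/src0'], ['layer0/src1', 'layer1/src1']]
print(attention[0])  # attention to the first source, one entry per layer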