Example #1
    def testBuildFutureMaskWithMaxLen(self, dtype):
        length = [2, 4, 3]
        maximum_length = 5
        expected = np.array([
            [
                [1, 0, 0, 0, 0],
                [1, 1, 0, 0, 0],
                [1, 1, 0, 0, 0],
                [1, 1, 0, 0, 0],
                [1, 1, 0, 0, 0],
            ],
            [
                [1, 0, 0, 0, 0],
                [1, 1, 0, 0, 0],
                [1, 1, 1, 0, 0],
                [1, 1, 1, 1, 0],
                [1, 1, 1, 1, 0],
            ],
            [
                [1, 0, 0, 0, 0],
                [1, 1, 0, 0, 0],
                [1, 1, 1, 0, 0],
                [1, 1, 1, 0, 0],
                [1, 1, 1, 0, 0],
            ],
        ]).astype(dtype.as_numpy_dtype)

        mask = transformer.future_mask(tf.constant(length),
                                       maximum_length=maximum_length,
                                       dtype=dtype)
        self.assertIs(mask.dtype, dtype)
        mask = self.evaluate(mask)
        self.assertTupleEqual(mask.shape,
                              (len(length), maximum_length, maximum_length))
        self.assertAllEqual(mask, expected)
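For reference, the mask checked above can be reproduced with plain TensorFlow ops. The sketch below is not the library implementation of transformer.future_mask; it is an equivalent construction that combines a causal lower-triangular mask with the padding mask derived from the sequence lengths.

import tensorflow as tf

def build_future_mask(sequence_length, maximum_length, dtype=tf.bool):
    # Causal part: query position i may attend to key positions j <= i.
    r = tf.range(maximum_length)
    causal = r[tf.newaxis, :] <= r[:, tf.newaxis]  # [max_len, max_len]
    # Padding part: mask out key positions beyond each sequence length.
    padding = tf.sequence_mask(sequence_length, maxlen=maximum_length)  # [batch, max_len]
    # Broadcast and combine into a [batch, max_len, max_len] mask.
    mask = tf.logical_and(causal[tf.newaxis], padding[:, tf.newaxis, :])
    return tf.cast(mask, dtype)

print(build_future_mask(tf.constant([2, 4, 3]), 5, dtype=tf.float32))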
Example #2
    def _run(
        self,
        inputs,
        sequence_length=None,
        cache=None,
        memory=None,
        memory_sequence_length=None,
        step=None,
        training=None,
    ):
        # Process inputs.
        inputs *= self.num_units**0.5
        if self.position_encoder is not None:
            inputs = self.position_encoder(
                inputs, position=step + 1 if step is not None else None)
        inputs = common.dropout(inputs, self.dropout, training=training)

        # Prepare query mask.
        mask = None
        if step is None:
            maximum_length = tf.shape(inputs)[1]
            if sequence_length is None:
                batch_size = tf.shape(inputs)[0]
                sequence_length = tf.fill([batch_size], maximum_length)
            mask = transformer.future_mask(sequence_length,
                                           maximum_length=maximum_length)

        # Prepare memory mask.
        memory_mask = None
        if memory is not None:
            if not isinstance(memory, (list, tuple)):
                memory = (memory,)
        if memory_sequence_length is not None:
            if not isinstance(memory_sequence_length, (list, tuple)):
                memory_sequence_length = (memory_sequence_length,)
            memory_mask = [
                tf.sequence_mask(mem_length, maxlen=tf.shape(mem)[1])
                for mem, mem_length in zip(memory, memory_sequence_length)
            ]

        # Run each layer.
        new_cache = []
        for i, layer in enumerate(self.layers):
            inputs, layer_cache, attention = layer(
                inputs,
                mask=mask,
                memory=memory,
                memory_mask=memory_mask,
                cache=cache[i] if cache is not None else None,
                training=training,
            )
            new_cache.append(layer_cache)
        outputs = self.layer_norm(inputs)
        return outputs, new_cache, attention
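A minimal sketch of the memory-mask construction used in the list comprehension above, with hypothetical shapes and lengths: for each memory tensor of shape [batch, time, depth], tf.sequence_mask turns the per-example lengths into a boolean padding mask of shape [batch, time].

import tensorflow as tf

memory = tf.random.normal([2, 4, 8])           # hypothetical encoder output
memory_sequence_length = tf.constant([4, 2])   # hypothetical true lengths

memory_mask = tf.sequence_mask(memory_sequence_length,
                               maxlen=tf.shape(memory)[1])
print(memory_mask.numpy())
# [[ True  True  True  True]
#  [ True  True False False]]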
Example #3
    def _run(
        self,
        inputs,
        sequence_length=None,
        cache=None,
        memory=None,
        memory_sequence_length=None,
        step=None,
        training=None,
    ):
        # Process inputs.
        inputs *= self.num_units ** 0.5
        if self.position_encoder is not None:
            inputs = self.position_encoder(
                inputs, position=step + 1 if step is not None else None
            )
        inputs = common.dropout(inputs, self.dropout, training=training)

        # Prepare query mask.
        mask = None
        if step is None:
            maximum_length = tf.shape(inputs)[1]
            if sequence_length is None:
                batch_size = tf.shape(inputs)[0]
                sequence_length = tf.fill([batch_size], maximum_length)
            mask = transformer.future_mask(
                sequence_length, maximum_length=maximum_length
            )

        # Prepare memory mask.
        memory_mask = None
        if memory is not None:
            if not isinstance(memory, (list, tuple)):
                memory = (memory,)
            if memory_sequence_length is not None:
                if not isinstance(memory_sequence_length, (list, tuple)):
                    memory_sequence_length = (memory_sequence_length,)
                memory_mask = [
                    tf.sequence_mask(mem_length, maxlen=tf.shape(mem)[1])
                    for mem, mem_length in zip(memory, memory_sequence_length)
                ]
            else:
                memory_mask = tuple(None for _ in memory)

        # Run each layer.
        new_cache = []
        attention = []
        for i, layer in enumerate(self.layers):
            inputs, layer_cache, layer_attention = layer(
                inputs,
                mask=mask,
                memory=memory,
                memory_mask=memory_mask,
                cache=cache[i] if cache is not None else None,
                training=training,
            )
            attention.append(layer_attention)
            new_cache.append(layer_cache)
        outputs = self.layer_norm(inputs) if self.layer_norm is not None else inputs

        # Convert list of shape num_layers x num_sources to num_sources x num_layers
        attention = list(map(list, zip(*attention)))
        if attention:
            attention = transformer.MultiHeadAttentionReduction.reduce(
                attention[0],  # Get attention to the first source.
                self.attention_reduction,
            )
        else:
            attention = None

        return outputs, new_cache, attention
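The transposition attention = list(map(list, zip(*attention))) is the least obvious step above. A small pure-Python sketch with placeholder values shows how a per-layer list of per-source attentions becomes a per-source list of per-layer attentions, so that attention[0] holds every layer's attention to the first source.

per_layer = [
    ["layer0_src0", "layer0_src1"],
    ["layer1_src0", "layer1_src1"],
    ["layer2_src0", "layer2_src1"],
]  # num_layers x num_sources
per_source = list(map(list, zip(*per_layer)))  # num_sources x num_layers
print(per_source[0])
# ['layer0_src0', 'layer1_src0', 'layer2_src0']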