Example #1
0
    def testCumulativeAverage(self):
        """Compare `transformer.cumulative_average` with hand-computed means.

        Two single-feature sequences with declared lengths 3 and 4 are
        averaged cumulatively along the time axis. The expected output for the
        4th position of the first sequence (beyond its declared length) is 0.
        """
        sequence_lengths = [3, 4]
        inputs = [
            [[1.0], [2.0], [3.0], [0.0]],
            [[2.0], [4.0], [6.0], [8.0]],
        ]
        # Running mean of the values seen so far at each timestep:
        #   seq 0: 1/1, (1+2)/2, (1+2+3)/3, then 0 past its length
        #   seq 1: 2/1, (2+4)/2, (2+4+6)/3, (2+4+6+8)/4
        expected = [
            [[1.0], [1.5], [2.0], [0.0]],
            [[2.0], [3.0], [4.0], [5.0]],
        ]

        average_mask = transformer.cumulative_average_mask(
            tf.constant(sequence_lengths))
        averaged = self.evaluate(
            transformer.cumulative_average(inputs, average_mask))
        self.assertAllClose(expected, averaged)
Example #2
0
 def _body(i, accu, cache):
     """Average timestep `i` of the enclosing `x` and record the result.

     Slices a length-1 window `x[:, i:i + 1]`, runs
     `transformer.cumulative_average` on it with the step index and cache,
     squeezes axis 1 and writes the result at index `i` of `accu`.

     NOTE(review): presumably the body of a `tf.while_loop` with `accu` a
     `tf.TensorArray` -- the caller is not visible here, confirm.

     Returns:
       The tuple `(i + 1, updated accu, cache)`.
     """
     step_average = transformer.cumulative_average(x[:, i:i + 1], i, cache)
     next_step = i + 1
     updated_accu = accu.write(i, tf.squeeze(step_average, axis=1))
     return next_step, updated_accu, cache
  def _self_attention_stack(self,
                            inputs,
                            sequence_length=None,
                            mode=tf.estimator.ModeKeys.TRAIN,
                            cache=None,
                            memory=None,
                            memory_sequence_length=None,
                            step=None):
    """Runs `inputs` through the `self.num_layers` decoder layers.

    The inputs are scaled by `self.num_units**0.5`, optionally position
    encoded, and passed through dropout. Each layer then applies a
    self-attention sub-layer (`"scaled_dot"` or `"average"`, selected by
    `self.self_attention_type`), an attention over `memory` when one is
    given, and a feed-forward sub-layer. Every sub-layer reads a
    `transformer.norm`-ed input and is merged back with
    `transformer.drop_and_add`.

    Args:
      inputs: Input tensor. Axis 0 is used as the batch size and axis 1 as the
        maximum length when building masks.
      sequence_length: Lengths used to build the self-attention mask. For
        `"average"` attention without a cache, a missing value is replaced by
        a vector filled with `tf.shape(inputs)[1]`.
      mode: A `tf.estimator.ModeKeys` value; the input dropout is only active
        in `TRAIN`, and the mode is forwarded to the `transformer` helpers.
      cache: Optional mapping with one entry per `"layer_<index>"`, forwarded
        to the attention helpers of that layer.
      memory: Optional tensor attended to by every layer.
      memory_sequence_length: Lengths used to mask `memory`; no mask is built
        unless both this and `memory` are given.
      step: Current step. When set, the position encoder is applied with
        `apply_one(inputs, step + 1)`, and `"average"` attention receives it
        in place of the mask whenever `cache` is not `None`.

    Returns:
      A tuple `(outputs, first_head_attention)`: the normalized output of the
      last layer, and index 0 along axis 1 of the last layer's memory
      attention (`None` when `memory` is `None`).
    """
    inputs *= self.num_units**0.5

    encoder = self.position_encoder
    if encoder is not None:
      if step is not None:
        inputs = encoder.apply_one(inputs, step + 1)
      else:
        inputs = encoder(inputs, sequence_length=sequence_length)

    is_training = mode == tf.estimator.ModeKeys.TRAIN
    inputs = tf.layers.dropout(inputs, rate=self.dropout, training=is_training)

    decoder_mask = None
    memory_mask = None
    last_attention = None

    # Self-attention mask: a future mask for dot-product attention, or an
    # averaging mask for average attention when no cache is supplied.
    attention_type = self.self_attention_type
    if attention_type == "scaled_dot" and sequence_length is not None:
      decoder_mask = transformer.build_future_mask(
          sequence_length,
          num_heads=self.num_heads,
          maximum_length=tf.shape(inputs)[1])
    elif attention_type == "average" and cache is None:
      if sequence_length is None:
        # No lengths given: use the full time dimension for every batch entry.
        batch_size = tf.shape(inputs)[0]
        max_time = tf.shape(inputs)[1]
        sequence_length = tf.fill([batch_size], max_time)
      decoder_mask = transformer.cumulative_average_mask(
          sequence_length,
          maximum_length=tf.shape(inputs)[1],
          dtype=inputs.dtype)

    if not (memory is None or memory_sequence_length is None):
      memory_mask = transformer.build_sequence_mask(
          memory_sequence_length,
          num_heads=self.num_heads,
          maximum_length=tf.shape(memory)[1])

    for layer_index in range(self.num_layers):
      layer_name = "layer_{}".format(layer_index)
      layer_cache = None if cache is None else cache[layer_name]

      with tf.variable_scope(layer_name):
        # Self-attention sub-layer.
        if attention_type == "scaled_dot":
          with tf.variable_scope("masked_multi_head"):
            normed_inputs = transformer.norm(inputs)
            attended = transformer.multi_head_attention(
                self.num_heads, normed_inputs, None, mode,
                num_units=self.num_units,
                mask=decoder_mask,
                cache=layer_cache,
                dropout=self.attention_dropout)
            encoded = transformer.drop_and_add(
                inputs, attended, mode, dropout=self.dropout)
        elif attention_type == "average":
          with tf.variable_scope("average_attention"):
            normed_inputs = transformer.norm(inputs)
            # The helper takes the mask when there is no cache, else the step.
            if cache is None:
              mask_or_step = decoder_mask
            else:
              mask_or_step = step
            averaged = transformer.cumulative_average(
                normed_inputs, mask_or_step, cache=layer_cache)
            averaged = transformer.feed_forward(
                averaged, self.ffn_inner_dim, mode, dropout=self.relu_dropout)
            # Gating layer: one dense projection split into two gates.
            gates = tf.layers.dense(
                tf.concat([normed_inputs, averaged], -1), self.num_units * 2)
            input_gate, forget_gate = tf.split(gates, 2, axis=-1)
            gated = tf.sigmoid(input_gate) * normed_inputs + tf.sigmoid(forget_gate) * averaged
            encoded = transformer.drop_and_add(
                inputs, gated, mode, dropout=self.dropout)

        # Attention over the memory, when there is one.
        if memory is None:
          context = encoded
        else:
          with tf.variable_scope("multi_head"):
            normed_encoded = transformer.norm(encoded)
            context, last_attention = transformer.multi_head_attention(
                self.num_heads, normed_encoded, memory, mode,
                mask=memory_mask,
                cache=layer_cache,
                dropout=self.attention_dropout,
                return_attention=True)
            context = transformer.drop_and_add(
                encoded, context, mode, dropout=self.dropout)

        # Feed-forward sub-layer.
        with tf.variable_scope("ffn"):
          normed_context = transformer.norm(context)
          transformed = transformer.feed_forward(
              normed_context, self.ffn_inner_dim, mode,
              dropout=self.relu_dropout)
          transformed = transformer.drop_and_add(
              context, transformed, mode, dropout=self.dropout)

        inputs = transformed

    # The first head of the last layer is returned.
    first_head_attention = (
        None if last_attention is None else last_attention[:, 0])

    outputs = transformer.norm(inputs)
    return outputs, first_head_attention
Example #4
0
    def _self_attention_stack(self,
                              inputs,
                              sequence_length=None,
                              mode=tf.estimator.ModeKeys.TRAIN,
                              cache=None,
                              memory=None,
                              memory_sequence_length=None,
                              step=None):
        """Run ``inputs`` through the ``self.num_layers`` decoder layers.

        The inputs are scaled by ``self.num_units ** 0.5``, optionally position
        encoded, and passed through dropout. Each layer then applies, in order:
        a self-attention sub-layer (``"scaled_dot"`` multi-head attention or
        ``"average"`` cumulative-average attention followed by a gating
        layer), one attention sub-layer per memory (if any), and a
        feed-forward sub-layer. Every sub-layer reads a ``transformer.norm``-ed
        input and is merged back with ``transformer.drop_and_add``.

        Shape notes in the inline comments are carried over from the original
        author's annotations; they cannot be verified from this method alone.

        Args:
          inputs: Input tensor. Axis 0 is used as the batch size and axis 1 as
            the maximum length when building masks (annotated by the original
            author as ``[batch, max_dec_len, emb_dim]``).
          sequence_length: Lengths used to build the self-attention mask. For
            ``"average"`` attention without a cache, a missing value is
            replaced by a vector filled with ``tf.shape(inputs)[1]``.
          mode: A ``tf.estimator.ModeKeys`` value; the input dropout is only
            active in ``TRAIN``, and the mode is forwarded to the
            ``transformer`` helpers.
          cache: Optional mapping with one entry per ``"layer_<index>"``. Each
            entry is forwarded to self-attention, and its ``"memory"`` item is
            indexed per memory for the memory attention.
          memory: Optional memory tensor, or a sequence of them, attended to by
            every layer.
          memory_sequence_length: Optional length tensor, or a sequence of them
            paired with ``memory``. When omitted, every memory is attended
            without a mask. Ignored when ``memory`` is ``None``.
          step: Current step. When set, ``step + 1`` is passed to the position
            encoder, and ``"average"`` attention receives it in place of the
            mask whenever ``cache`` is not ``None``.

        Returns:
          A tuple ``(outputs, first_head_attention)``: the normalized output of
          the last layer, and index 0 along axis 1 of the last layer's memory
          attention. The attention is ``None`` when there is no memory or when
          more than one memory is given.
        """
        inputs *= self.num_units ** 0.5
        if self.position_encoder is not None:
            position = step + 1 if step is not None else None
            inputs = self.position_encoder(inputs, position=position)
        inputs = tf.layers.dropout(
            inputs,
            rate=self.dropout,
            training=mode == tf.estimator.ModeKeys.TRAIN)

        decoder_mask = None
        memory_mask = None
        last_attention = None

        if self.self_attention_type == "scaled_dot":
            # Original note: sequence_length is None when decoding and set
            # when training, so the future mask is only built in training.
            if sequence_length is not None:
                decoder_mask = transformer.build_future_mask(
                    sequence_length,
                    num_heads=self.num_heads,
                    maximum_length=tf.shape(inputs)[1])
        elif self.self_attention_type == "average":
            if cache is None:
                if sequence_length is None:
                    # No lengths given: use the full time dimension for every
                    # batch entry.
                    sequence_length = tf.fill(
                        [tf.shape(inputs)[0]], tf.shape(inputs)[1])
                decoder_mask = transformer.cumulative_average_mask(
                    sequence_length,
                    maximum_length=tf.shape(inputs)[1],
                    dtype=inputs.dtype)

        if memory is not None:
            is_sequence = tf.contrib.framework.nest.is_sequence
            if not is_sequence(memory):
                memory = (memory,)
            if memory_sequence_length is not None:
                if not is_sequence(memory_sequence_length):
                    memory_sequence_length = (memory_sequence_length,)
                memory_mask = [
                    transformer.build_sequence_mask(
                        length,
                        num_heads=self.num_heads,
                        maximum_length=tf.shape(mem)[1])
                    for mem, length in zip(memory, memory_sequence_length)]
            else:
                # Without lengths there is nothing to mask. Keep one (empty)
                # mask per memory so the per-layer zip below still pairs each
                # memory with a mask instead of failing on zip(memory, None).
                memory_mask = [None for _ in memory]

        for layer_index in range(self.num_layers):
            layer_name = "layer_{}".format(layer_index)
            # Original note: there is no cache during training; a cache is
            # present during decoding.
            layer_cache = cache[layer_name] if cache is not None else None
            with tf.variable_scope(layer_name):
                # Self-attention over the decoder inputs (training) or the
                # last step output (decoding).
                if self.self_attention_type == "scaled_dot":
                    with tf.variable_scope("masked_multi_head"):
                        encoded = transformer.multi_head_attention(
                            self.num_heads,
                            transformer.norm(inputs),
                            None,
                            mode,
                            num_units=self.num_units,
                            mask=decoder_mask,
                            cache=layer_cache,
                            dropout=self.attention_dropout)
                        last_context = transformer.drop_and_add(
                            inputs,
                            encoded,
                            mode,
                            dropout=self.dropout)
                elif self.self_attention_type == "average":
                    with tf.variable_scope("average_attention"):
                        # Cumulative average: the helper takes the mask when
                        # there is no cache, else the current step.
                        x = transformer.norm(inputs)
                        y = transformer.cumulative_average(
                            x,
                            decoder_mask if cache is None else step,
                            cache=layer_cache)
                        # FFN.
                        y = transformer.feed_forward(
                            y, self.ffn_inner_dim, mode,
                            dropout=self.relu_dropout)
                        # Gating layer: one dense projection split into two
                        # gates.
                        z = tf.layers.dense(
                            tf.concat([x, y], -1), self.num_units * 2)
                        input_gate, forget_gate = tf.split(z, 2, axis=-1)
                        y = tf.sigmoid(input_gate) * x + tf.sigmoid(forget_gate) * y
                        last_context = transformer.drop_and_add(
                            inputs, y, mode, dropout=self.dropout)

                # Attend to each memory in turn using the decoder context.
                if memory is not None:
                    for index, (mem, mask) in enumerate(
                            zip(memory, memory_mask)):
                        memory_cache = (
                            layer_cache["memory"][index]
                            if layer_cache is not None else None)
                        scope_name = (
                            "multi_head" if index == 0
                            else "multi_head_%d" % index)
                        with tf.variable_scope(scope_name):
                            context, last_attention = transformer.multi_head_attention(
                                self.num_heads,
                                transformer.norm(last_context),
                                mem,
                                mode,
                                mask=mask,
                                cache=memory_cache,
                                dropout=self.attention_dropout,
                                return_attention=True)
                            last_context = transformer.drop_and_add(
                                last_context,
                                context,
                                mode,
                                dropout=self.dropout)
                            # Do not return attention in case of multi source.
                            if index > 0:
                                last_attention = None

                with tf.variable_scope("ffn"):
                    transformed = transformer.feed_forward(
                        transformer.norm(last_context),
                        self.ffn_inner_dim,
                        mode,
                        dropout=self.relu_dropout)
                    transformed = transformer.drop_and_add(
                        last_context,
                        transformed,
                        mode,
                        dropout=self.dropout)

                inputs = transformed

        if last_attention is not None:
            # The first head of the last layer is returned.
            first_head_attention = last_attention[:, 0]
        else:
            first_head_attention = None

        outputs = transformer.norm(inputs)
        return outputs, first_head_attention