Example 1
  def testScaledDotAttention(self):
    batch_size = 3
    num_heads = 8
    values_length = [5, 3, 7]
    queries_length = [8, 6, 10]
    depth = 20

    queries = tf.placeholder_with_default(
        np.random.randn(batch_size, num_heads, max(queries_length), depth).astype(np.float32),
        shape=(None, num_heads, None, depth))
    values = tf.placeholder_with_default(
        np.random.randn(batch_size, num_heads, max(values_length), depth).astype(np.float32),
        shape=(None, num_heads, None, depth))
    keys = values

    mask = transformer.build_sequence_mask(values_length, num_heads=num_heads)
    context, attn = transformer.dot_product_attention(
        queries,
        keys,
        values,
        tf.estimator.ModeKeys.PREDICT,
        mask=mask)

    with self.test_session() as sess:
      context, attn = sess.run([context, attn])
      self.assertTupleEqual(
          (batch_size, num_heads, max(queries_length), depth), context.shape)
      self.assertTupleEqual(
          (batch_size, num_heads, max(queries_length), max(values_length)), attn.shape)

      for i in range(batch_size):
        length = values_length[i]
        padding_length = max(values_length) - length
        if padding_length > 0:
          self.assertEqual(0.0, np.sum(attn[i, :, :, length:max(values_length)]))
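The assertions above check that attention weights over padded value positions come out as zero. As a rough NumPy illustration of that masking behaviour (a minimal sketch, not the code under test), the padding mask suppresses the logits of padded keys before the softmax:

import numpy as np

def masked_softmax(logits, mask):
    # mask holds 1.0 for valid key positions and 0.0 for padding.
    logits = np.where(mask > 0.0, logits, -1e9)  # suppress padded keys
    weights = np.exp(logits - logits.max(axis=-1, keepdims=True))
    return weights / weights.sum(axis=-1, keepdims=True)

batch_size, num_heads, depth = 3, 8, 20
values_length = [5, 3, 7]
queries = np.random.randn(batch_size, num_heads, 10, depth)
keys = values = np.random.randn(batch_size, num_heads, max(values_length), depth)
mask = np.zeros((batch_size, 1, 1, max(values_length)))
for i, length in enumerate(values_length):
    mask[i, ..., :length] = 1.0

attn = masked_softmax(queries @ keys.transpose(0, 1, 3, 2) / depth ** 0.5, mask)
context = attn @ values                                     # [batch, heads, q_len, depth]
assert np.allclose(attn[0, :, :, values_length[0]:], 0.0)   # padded keys get zero weight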
Example 2
 def _build_memory_mask(self, memory, memory_sequence_length=None):
     if memory_sequence_length is None:
         return None
     else:
         return transformer.build_sequence_mask(
             memory_sequence_length,
             num_heads=self.num_heads,
             maximum_length=tf.shape(memory)[1])
Example 3
 def _build_memory_mask(self, memory, memory_sequence_length=None):
   if memory_sequence_length is None:
     return None
   else:
     return transformer.build_sequence_mask(
         memory_sequence_length,
         num_heads=self.num_heads,
         maximum_length=tf.shape(memory)[1],
         dtype=memory.dtype)
Example 4
    def cross_attention(self,
                        table_encodes,
                        document_encodes,
                        num_units,
                        num_heads,
                        num_layers,
                        ffn_inner_dim,
                        sequence_length=None,
                        mode=tf.estimator.ModeKeys.TRAIN):
        table_encodes *= num_units**0.5
        # if self.position_encoder is not None:
        #     inputs = self.position_encoder(inputs)

        inputs = tf.layers.dropout(
            table_encodes,
            rate=FLAGS.attention_dropout,
            training=mode == tf.estimator.ModeKeys.TRAIN)
        mask = transformer.build_sequence_mask(
            sequence_length,
            num_heads=num_heads,
            maximum_length=tf.shape(document_encodes)[1])

        state = ()

        for l in range(num_layers):
            with tf.variable_scope("layer_{}".format(l)):
                with tf.variable_scope("multi_head"):
                    context = transformer.multi_head_attention(
                        num_heads,
                        transformer.norm(inputs),
                        document_encodes,
                        mode,
                        num_units=num_units,
                        mask=mask,
                        dropout=FLAGS.attention_dropout)
                    context = transformer.drop_and_add(inputs,
                                                       context,
                                                       mode,
                                                       dropout=FLAGS.dropout)

                with tf.variable_scope("ffn"):
                    transformed = transformer.feed_forward(
                        transformer.norm(context),
                        ffn_inner_dim,
                        mode,
                        dropout=FLAGS.attention_dropout)
                    transformed = transformer.drop_and_add(
                        context, transformed, mode, dropout=FLAGS.dropout)

                inputs = transformed
                state += (tf.reduce_mean(inputs, axis=1), )

        outputs = transformer.norm(inputs)
        # return (outputs, state, sequence_length)
        return outputs
Example 5
    def testBuildSequenceMask(self):
        num_heads = 4
        length = [5, 3, 7]
        expected = [[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0],
                    [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
                    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]

        mask = transformer.build_sequence_mask(tf.constant(length),
                                               num_heads=num_heads)
        mask = self.evaluate(mask)
        self.assertTupleEqual(mask.shape, (len(length), 1, 1, max(length)))
        self.assertAllEqual(np.squeeze(mask), expected)
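The shape assertion above implies a mask of shape [batch, 1, 1, max_length] that broadcasts over the head and query-position dimensions, which is why num_heads does not show up in the asserted shape. A minimal sketch of an equivalent computation, assumed rather than taken from the library, is:

import tensorflow as tf

def sequence_mask_sketch(sequence_length, maximum_length=None, dtype=tf.float32):
    # [batch, max_length] padding mask, expanded to [batch, 1, 1, max_length]
    # so it broadcasts over the head and query-position dimensions.
    mask = tf.sequence_mask(sequence_length, maxlen=maximum_length, dtype=dtype)
    return tf.expand_dims(tf.expand_dims(mask, 1), 1)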
Example 6
    def encode(self,
               inputs,
               sequence_length=None,
               mode=tf.estimator.ModeKeys.TRAIN):
        inputs *= self.num_units**0.5
        if self.position_encoder is not None:
            inputs = self.position_encoder(inputs)
        else:
            print("===============================================")
            print("no position encoder")

        inputs = tf.layers.dropout(
            inputs,
            rate=self.dropout,
            training=mode == tf.estimator.ModeKeys.TRAIN)
        mask = transformer.build_sequence_mask(
            sequence_length,
            num_heads=self.num_heads,
            maximum_length=tf.shape(inputs)[1])

        state = ()

        for l in range(self.num_layers):
            with tf.variable_scope("layer_{}".format(l)):
                with tf.variable_scope("multi_head"):
                    context = transformer.multi_head_attention(
                        self.num_heads,
                        transformer.norm(inputs),
                        None,
                        mode,
                        num_units=self.num_units,
                        mask=mask,
                        dropout=self.attention_dropout)
                    context = transformer.drop_and_add(inputs,
                                                       context,
                                                       mode,
                                                       dropout=self.dropout)

                with tf.variable_scope("ffn"):
                    transformed = transformer.feed_forward(
                        transformer.norm(context),
                        self.ffn_inner_dim,
                        mode,
                        dropout=self.relu_dropout)
                    transformed = transformer.drop_and_add(
                        context, transformed, mode, dropout=self.dropout)

                inputs = transformed
                state += (tf.reduce_mean(inputs, axis=1), )

        outputs = transformer.norm(inputs)
        return (outputs, state, sequence_length)
Example 7
  def encode(self, inputs, sequence_length=None, mode=tf.estimator.ModeKeys.TRAIN):
    inputs *= self.num_units**0.5
    if self.position_encoder is not None:
      inputs = self.position_encoder(inputs, sequence_length=sequence_length)

    inputs = tf.layers.dropout(
        inputs,
        rate=self.dropout,
        training=mode == tf.estimator.ModeKeys.TRAIN)
    mask = transformer.build_sequence_mask(
        sequence_length,
        num_heads=self.num_heads,
        maximum_length=tf.shape(inputs)[1],
        dtype=inputs.dtype)

    state = ()

    for l in range(self.num_layers):
      with tf.variable_scope("layer_{}".format(l)):
        with tf.variable_scope("multi_head"):
          context = transformer.multi_head_attention(
              self.num_heads,
              transformer.norm(inputs),
              None,
              mode,
              num_units=self.num_units,
              mask=mask,
              dropout=self.attention_dropout)
          context = transformer.drop_and_add(
              inputs,
              context,
              mode,
              dropout=self.dropout)

        with tf.variable_scope("ffn"):
          transformed = transformer.feed_forward(
              transformer.norm(context),
              self.ffn_inner_dim,
              mode,
              dropout=self.relu_dropout)
          transformed = transformer.drop_and_add(
              context,
              transformed,
              mode,
              dropout=self.dropout)

        inputs = transformed
        state += (tf.reduce_mean(inputs, axis=1),)

    outputs = transformer.norm(inputs)
    return (outputs, state, sequence_length)
Example 8
  def testBuildSequenceMask(self):
    num_heads = 4
    length = [5, 3, 7]
    expected = [
        [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0],
        [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
        [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]

    mask = transformer.build_sequence_mask(tf.constant(length), num_heads=num_heads)

    with self.test_session() as sess:
      mask = sess.run(mask)
      mask = np.reshape(mask, (len(length), num_heads, max(length)))
      mask = np.transpose(mask, (1, 0, 2))
      for b in range(len(length)):
        self.assertAllEqual(expected, mask[b])
Example 9
    def testBuildSequenceMaskWithMaxLen(self):
        num_heads = 4
        length = [5, 3, 6]
        maximum_length = 7
        expected = [[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0],
                    [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
                    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0]]

        mask = transformer.build_sequence_mask(tf.constant(length),
                                               num_heads=num_heads,
                                               maximum_length=maximum_length)

        with self.test_session() as sess:
            mask = sess.run(mask)
            self.assertTupleEqual(mask.shape,
                                  (len(length), 1, 1, maximum_length))
            self.assertAllEqual(np.squeeze(mask), expected)
Example 10
 def __init__(self,
              mode,
              num_layers,
              num_units,
              num_heads,
              memory,
              memory_sequence_length,
              cell_class=tf.contrib.rnn.LayerNormBasicLSTMCell,
              dropout=0.3):
     super(_RNMTPlusDecoderCell, self).__init__()
     self._mode = mode
     self._num_units = num_units
     self._num_heads = num_heads
     self._dropout = dropout
     self._cells = [cell_class(num_units) for _ in range(num_layers)]
     self._memory = memory
     self._memory_mask = build_sequence_mask(
         memory_sequence_length,
         num_heads=self._num_heads,
         maximum_length=tf.shape(memory)[1])
Example 11
  def _self_attention_stack(self,
                            inputs,
                            sequence_length=None,
                            mode=tf.estimator.ModeKeys.TRAIN,
                            cache=None,
                            memory=None,
                            memory_sequence_length=None,
                            step=None):
    inputs *= self.num_units**0.5
    if self.position_encoder is not None:
      if step is None:
        inputs = self.position_encoder(inputs, sequence_length=sequence_length)
      else:
        inputs = self.position_encoder.apply_one(inputs, step + 1)

    inputs = tf.layers.dropout(
        inputs,
        rate=self.dropout,
        training=mode == tf.estimator.ModeKeys.TRAIN)

    decoder_mask = None
    memory_mask = None
    last_attention = None

    if self.self_attention_type == "scaled_dot":
      if sequence_length is not None:
        decoder_mask = transformer.build_future_mask(
            sequence_length,
            num_heads=self.num_heads,
            maximum_length=tf.shape(inputs)[1])
    elif self.self_attention_type == "average":
      if cache is None:
        if sequence_length is None:
          sequence_length = tf.fill([tf.shape(inputs)[0]], tf.shape(inputs)[1])
        decoder_mask = transformer.cumulative_average_mask(
            sequence_length, maximum_length=tf.shape(inputs)[1], dtype=inputs.dtype)

    if memory is not None and memory_sequence_length is not None:
      memory_mask = transformer.build_sequence_mask(
          memory_sequence_length,
          num_heads=self.num_heads,
          maximum_length=tf.shape(memory)[1])

    for l in range(self.num_layers):
      layer_name = "layer_{}".format(l)
      layer_cache = cache[layer_name] if cache is not None else None
      with tf.variable_scope(layer_name):
        if self.self_attention_type == "scaled_dot":
          with tf.variable_scope("masked_multi_head"):
            encoded = transformer.multi_head_attention(
                self.num_heads,
                transformer.norm(inputs),
                None,
                mode,
                num_units=self.num_units,
                mask=decoder_mask,
                cache=layer_cache,
                dropout=self.attention_dropout)
            encoded = transformer.drop_and_add(
                inputs,
                encoded,
                mode,
                dropout=self.dropout)
        elif self.self_attention_type == "average":
          with tf.variable_scope("average_attention"):
            # Cumulative average.
            x = transformer.norm(inputs)
            y = transformer.cumulative_average(
                x, decoder_mask if cache is None else step, cache=layer_cache)
            # FFN.
            y = transformer.feed_forward(
                y, self.ffn_inner_dim, mode, dropout=self.relu_dropout)
            # Gating layer.
            z = tf.layers.dense(tf.concat([x, y], -1), self.num_units * 2)
            i, f = tf.split(z, 2, axis=-1)
            y = tf.sigmoid(i) * x + tf.sigmoid(f) * y
            encoded = transformer.drop_and_add(
                inputs, y, mode, dropout=self.dropout)

        if memory is not None:
          with tf.variable_scope("multi_head"):
            context, last_attention = transformer.multi_head_attention(
                self.num_heads,
                transformer.norm(encoded),
                memory,
                mode,
                mask=memory_mask,
                cache=layer_cache,
                dropout=self.attention_dropout,
                return_attention=True)
            context = transformer.drop_and_add(
                encoded,
                context,
                mode,
                dropout=self.dropout)
        else:
          context = encoded

        with tf.variable_scope("ffn"):
          transformed = transformer.feed_forward(
              transformer.norm(context),
              self.ffn_inner_dim,
              mode,
              dropout=self.relu_dropout)
          transformed = transformer.drop_and_add(
              context,
              transformed,
              mode,
              dropout=self.dropout)

        inputs = transformed

    if last_attention is not None:
      # The first head of the last layer is returned.
      first_head_attention = last_attention[:, 0]
    else:
      first_head_attention = None

    outputs = transformer.norm(inputs)
    return outputs, first_head_attention
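This decoder stack and the next one call transformer.build_future_mask for causal self-attention; the next example annotates its result as [batch, 1, max_dec_len, max_dec_len]. A minimal sketch consistent with that shape (an assumption, not the library source) combines the padding mask with a lower-triangular matrix:

import tensorflow as tf

def future_mask_sketch(sequence_length, maximum_length=None, dtype=tf.float32):
    # Padding mask over key positions: [batch, max_len].
    seq_mask = tf.sequence_mask(sequence_length, maxlen=maximum_length, dtype=dtype)
    max_len = tf.shape(seq_mask)[1]
    # Lower-triangular causal mask: row t has ones in columns 0..t.
    causal = tf.sequence_mask(tf.range(1, max_len + 1), maxlen=max_len, dtype=dtype)
    # Combine and add a broadcastable head dimension: [batch, 1, max_len, max_len].
    return tf.expand_dims(tf.expand_dims(seq_mask, 1) * causal, 1)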
Example 12
    def _self_attention_stack(self,
                              inputs,  # batch, max_dec_len, emb_dim
                              sequence_length=None,  # [batch]
                              mode=tf.estimator.ModeKeys.TRAIN,
                              cache=None,
                              memory=None,  # [batch, enc_len, num_units]
                              memory_sequence_length=None,  # [batch]
                              step=None):
        inputs *= self.num_units ** 0.5
        if self.position_encoder is not None:
            inputs = self.position_encoder(inputs, position=step + 1 if step is not None else None)
            # inputs [batch, max_dec_len, emb_dim]
        inputs = tf.layers.dropout(  # batch, max_dec_len, emb_dim
            inputs,
            rate=self.dropout,
            training=mode == tf.estimator.ModeKeys.TRAIN)

        decoder_mask = None
        memory_mask = None
        last_attention = None

        if self.self_attention_type == "scaled_dot":
            if sequence_length is not None:  # sequence_length is None at decoding time and set at training time
                decoder_mask = transformer.build_future_mask(  # [batch, 1, max_dec_len, max_dec_len]
                    sequence_length,
                    num_heads=self.num_heads,
                    maximum_length=tf.shape(inputs)[1])
        elif self.self_attention_type == "average":
            if cache is None:
                if sequence_length is None:
                    sequence_length = tf.fill([tf.shape(inputs)[0]], tf.shape(inputs)[1])
                decoder_mask = transformer.cumulative_average_mask(
                    sequence_length, maximum_length=tf.shape(inputs)[1], dtype=inputs.dtype)

        if memory is not None and not tf.contrib.framework.nest.is_sequence(memory):
            memory = (memory,)
        if memory_sequence_length is not None:
            if not tf.contrib.framework.nest.is_sequence(memory_sequence_length):
                memory_sequence_length = (memory_sequence_length,)
            memory_mask = [  # [batch, 1, 1, enc_len]
                transformer.build_sequence_mask(
                    length, num_heads=self.num_heads, maximum_length=tf.shape(m)[1])
                for m, length in zip(memory, memory_sequence_length)]

        for l in range(self.num_layers):
            layer_name = "layer_{}".format(l)
            layer_cache = cache[layer_name] if cache is not None else None  # no cache at training time; the cache is used at decoding time
            with tf.variable_scope(layer_name):
                # Self-attention encodes the decoder input (training) or the previous step's output (decoding).
                if self.self_attention_type == "scaled_dot":
                    with tf.variable_scope("masked_multi_head"):
                        encoded = transformer.multi_head_attention(  # [batch, decode_len, hidden]
                            self.num_heads,
                            transformer.norm(inputs),
                            None,
                            mode,
                            num_units=self.num_units,
                            mask=decoder_mask,  # [batch, 1, len, len]
                            cache=layer_cache,
                            dropout=self.attention_dropout)
                        last_context = transformer.drop_and_add(  # [batch, decode_len, hidden]
                            inputs,
                            encoded,
                            mode,
                            dropout=self.dropout)
                elif self.self_attention_type == "average":
                    with tf.variable_scope("average_attention"):
                        # Cumulative average.
                        x = transformer.norm(inputs)
                        y = transformer.cumulative_average(
                            x, decoder_mask if cache is None else step, cache=layer_cache)
                        # FFN.
                        y = transformer.feed_forward(
                            y, self.ffn_inner_dim, mode, dropout=self.relu_dropout)
                        # Gating layer.
                        z = tf.layers.dense(tf.concat([x, y], -1), self.num_units * 2)
                        i, f = tf.split(z, 2, axis=-1)
                        y = tf.sigmoid(i) * x + tf.sigmoid(f) * y
                        last_context = transformer.drop_and_add(
                            inputs, y, mode, dropout=self.dropout)

                # Attend to the encoder memory using the decoder context.
                if memory is not None:
                    for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
                        # No cache at training time; the cache is used at decoding time.
                        memory_cache = layer_cache["memory"][i] if layer_cache is not None else None
                        with tf.variable_scope("multi_head" if i == 0 else "multi_head_%d" % i):
                            context, last_attention = transformer.multi_head_attention(
                                self.num_heads,
                                transformer.norm(last_context),
                                mem,  # [batch, enc_len, dim]
                                mode,
                                mask=mask,  # [batch, 1, 1, len]
                                cache=memory_cache,
                                dropout=self.attention_dropout,
                                return_attention=True)
                            # context: [batch, decode_len, num_units]; last_attention (training): [batch, heads, dec_len, enc_len]
                            last_context = transformer.drop_and_add(
                                last_context,  # [batch, decode_len, num_units]
                                context,
                                mode,
                                dropout=self.dropout)
                            if i > 0:  # Do not return attention in case of multi source.
                                last_attention = None

                with tf.variable_scope("ffn"):
                    transformed = transformer.feed_forward(  # [batch, decode_len, num_units]
                        transformer.norm(last_context),
                        self.ffn_inner_dim,
                        mode,
                        dropout=self.relu_dropout)
                    transformed = transformer.drop_and_add(  # [batch, decode_len, num_units]
                        last_context,
                        transformed,
                        mode,
                        dropout=self.dropout)

                inputs = transformed

        if last_attention is not None:
            # The first head of the last layer is returned.
            first_head_attention = last_attention[:, 0]
        else:
            first_head_attention = None

        outputs = transformer.norm(inputs)  # [batch, decode_len, num_units]
        return outputs, first_head_attention
Example 13
    def encode(self,
               inputs,
               sequence_length=None,
               mode=tf.estimator.ModeKeys.TRAIN):
        """

        :param inputs: [batch, enc_len, emb_dim]
        :param sequence_length: [batch]
        :param mode:
        :return: outputs: [batch, len, dim] last layer output
        state: a tuple ([batch, dim]) * num_layers, contains the sum over len of each layer outputs
        sequence_length [batch]
        """
        inputs *= self.num_units**0.5
        if self.position_encoder is not None:
            inputs = self.position_encoder(inputs)

        inputs = tf.layers.dropout(
            inputs,
            rate=self.dropout,
            training=mode == tf.estimator.ModeKeys.TRAIN)
        mask = transformer.build_sequence_mask(  # [batch, 1, 1, enc_len]
            sequence_length,
            num_heads=self.num_heads,
            maximum_length=tf.shape(inputs)[1])

        state = ()

        for l in range(self.num_layers):
            with tf.variable_scope("layer_{}".format(l)):
                with tf.variable_scope("multi_head"):
                    context = transformer.multi_head_attention(  # [batch, len, dim]
                        self.num_heads,
                        transformer.norm(inputs),
                        None,
                        mode,
                        num_units=self.num_units,
                        mask=mask,
                        dropout=self.attention_dropout)
                    context = transformer.drop_and_add(  # [batch, len, dim]
                        inputs,
                        context,
                        mode,
                        dropout=self.dropout)

                with tf.variable_scope("ffn"):
                    transformed = transformer.feed_forward(  # [batch, len, dim]
                        transformer.norm(context),
                        self.ffn_inner_dim,
                        mode,
                        dropout=self.relu_dropout)
                    transformed = transformer.drop_and_add(  # [batch, len, dim]
                        context,
                        transformed,
                        mode,
                        dropout=self.dropout)

                inputs = transformed
                state += (tf.reduce_mean(inputs, axis=1), )

        outputs = transformer.norm(inputs)  # [batch, len, dim]
        return (outputs, state, sequence_length)