def cross_attention(self,
                    table_encodes,
                    document_encodes,
                    num_units,
                    num_heads,
                    num_layers,
                    ffn_inner_dim,
                    sequence_length=None,
                    mode=tf.estimator.ModeKeys.TRAIN):
  """Attends from the table encoding (queries) to the document encoding (memory)."""
  table_encodes *= num_units**0.5
  # if self.position_encoder is not None:
  #   inputs = self.position_encoder(inputs)
  inputs = tf.layers.dropout(
      table_encodes,
      rate=FLAGS.attention_dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)
  # The mask is built over the document (memory) side, so sequence_length must
  # hold the document lengths.
  mask = transformer.build_sequence_mask(
      sequence_length,
      num_heads=num_heads,
      maximum_length=tf.shape(document_encodes)[1])
  state = ()
  for l in range(num_layers):
    with tf.variable_scope("layer_{}".format(l)):
      with tf.variable_scope("multi_head"):
        context = transformer.multi_head_attention(
            num_heads,
            transformer.norm(inputs),
            document_encodes,
            mode,
            num_units=num_units,
            mask=mask,
            dropout=FLAGS.attention_dropout)
        context = transformer.drop_and_add(
            inputs, context, mode, dropout=FLAGS.dropout)
      with tf.variable_scope("ffn"):
        transformed = transformer.feed_forward(
            transformer.norm(context),
            ffn_inner_dim,
            mode,
            dropout=FLAGS.attention_dropout)
        transformed = transformer.drop_and_add(
            context, transformed, mode, dropout=FLAGS.dropout)
      inputs = transformed
      state += (tf.reduce_mean(inputs, axis=1),)
  outputs = transformer.norm(inputs)
  # return (outputs, state, sequence_length)
  return outputs

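# Hedged usage sketch (not from the original code): how cross_attention might be wired
# after the table and document encoders. The variable names, placeholder shapes, and
# hyper-parameter values below are illustrative assumptions only.
def example_cross_attention_call(self, mode=tf.estimator.ModeKeys.TRAIN):
  table_enc = tf.placeholder(tf.float32, [None, None, 512])  # [batch, table_len, num_units]
  doc_enc = tf.placeholder(tf.float32, [None, None, 512])    # [batch, doc_len, num_units]
  doc_length = tf.placeholder(tf.int32, [None])              # [batch], document lengths for the memory mask
  fused = self.cross_attention(
      table_enc, doc_enc,
      num_units=512, num_heads=8, num_layers=2, ffn_inner_dim=2048,
      sequence_length=doc_length, mode=mode)
  return fused  # [batch, table_len, num_units], table positions contextualized by the document
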
def encode(self, inputs, sequence_length=None, mode=tf.estimator.ModeKeys.TRAIN):
  inputs *= self.num_units**0.5
  if self.position_encoder is not None:
    inputs = self.position_encoder(inputs)
  else:
    print("===============================================")
    print("no position encoder")
  inputs = tf.layers.dropout(
      inputs,
      rate=self.dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)
  mask = transformer.build_sequence_mask(
      sequence_length,
      num_heads=self.num_heads,
      maximum_length=tf.shape(inputs)[1])
  state = ()
  for l in range(self.num_layers):
    with tf.variable_scope("layer_{}".format(l)):
      with tf.variable_scope("multi_head"):
        context = transformer.multi_head_attention(
            self.num_heads,
            transformer.norm(inputs),
            None,
            mode,
            num_units=self.num_units,
            mask=mask,
            dropout=self.attention_dropout)
        context = transformer.drop_and_add(
            inputs, context, mode, dropout=self.dropout)
      with tf.variable_scope("ffn"):
        transformed = transformer.feed_forward(
            transformer.norm(context),
            self.ffn_inner_dim,
            mode,
            dropout=self.relu_dropout)
        transformed = transformer.drop_and_add(
            context, transformed, mode, dropout=self.dropout)
      inputs = transformed
      state += (tf.reduce_mean(inputs, axis=1),)
  outputs = transformer.norm(inputs)
  return (outputs, state, sequence_length)

def encode(self, inputs, sequence_length=None, mode=tf.estimator.ModeKeys.TRAIN):
  inputs *= self.num_units**0.5
  if self.position_encoder is not None:
    inputs = self.position_encoder(inputs, sequence_length=sequence_length)
  inputs = tf.layers.dropout(
      inputs,
      rate=self.dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)
  mask = transformer.build_sequence_mask(
      sequence_length,
      num_heads=self.num_heads,
      maximum_length=tf.shape(inputs)[1],
      dtype=inputs.dtype)
  state = ()
  for l in range(self.num_layers):
    with tf.variable_scope("layer_{}".format(l)):
      with tf.variable_scope("multi_head"):
        context = transformer.multi_head_attention(
            self.num_heads,
            transformer.norm(inputs),
            None,
            mode,
            num_units=self.num_units,
            mask=mask,
            dropout=self.attention_dropout)
        context = transformer.drop_and_add(
            inputs, context, mode, dropout=self.dropout)
      with tf.variable_scope("ffn"):
        transformed = transformer.feed_forward(
            transformer.norm(context),
            self.ffn_inner_dim,
            mode,
            dropout=self.relu_dropout)
        transformed = transformer.drop_and_add(
            context, transformed, mode, dropout=self.dropout)
      inputs = transformed
      state += (tf.reduce_mean(inputs, axis=1),)
  outputs = transformer.norm(inputs)
  return (outputs, state, sequence_length)

def _self_attention_stack(self,
                          inputs,
                          sequence_length=None,
                          mode=tf.estimator.ModeKeys.TRAIN,
                          cache=None,
                          memory=None,
                          memory_sequence_length=None):
  inputs = tf.layers.dropout(
      inputs,
      rate=self.dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)
  decoder_mask = None
  memory_mask = None
  if sequence_length is not None:
    decoder_mask = transformer.build_future_mask(
        sequence_length,
        num_heads=self.num_heads,
        maximum_length=tf.shape(inputs)[1],
        dtype=inputs.dtype)
  if memory is not None:
    if cache is not None:
      memory_mask = cache["memory_mask"]
    elif memory_sequence_length is not None:
      memory_mask = self._build_memory_mask(
          memory, memory_sequence_length=memory_sequence_length)
  for l in range(self.num_layers):
    layer_name = "layer_{}".format(l)
    layer_cache = cache[layer_name] if cache is not None else None
    with tf.variable_scope(layer_name):
      with tf.variable_scope("masked_multi_head"):
        encoded = transformer.multi_head_attention(
            self.num_heads,
            transformer.norm(inputs),
            None,
            mode,
            num_units=self.num_units,
            mask=decoder_mask,
            cache=layer_cache,
            dropout=self.attention_dropout)
        encoded = transformer.drop_and_add(
            inputs, encoded, mode, dropout=self.dropout)
      if memory is not None:
        with tf.variable_scope("multi_head"):
          context = transformer.multi_head_attention(
              self.num_heads,
              transformer.norm(encoded),
              memory,
              mode,
              mask=memory_mask,
              cache=layer_cache,
              dropout=self.attention_dropout)
          context = transformer.drop_and_add(
              encoded, context, mode, dropout=self.dropout)
      else:
        # Without a memory to attend to, feed the self-attention output forward
        # directly (otherwise `context` would be undefined below).
        context = encoded
      with tf.variable_scope("ffn"):
        transformed = transformer.feed_forward(
            transformer.norm(context),
            self.ffn_inner_dim,
            mode,
            dropout=self.relu_dropout)
        transformed = transformer.drop_and_add(
            context, transformed, mode, dropout=self.dropout)
      inputs = transformed
  outputs = transformer.norm(inputs)
  return outputs

def _self_attention_stack(self,
                          inputs,
                          sequence_length=None,
                          mode=tf.estimator.ModeKeys.TRAIN,
                          cache=None,
                          memory=None,
                          memory_sequence_length=None,
                          step=None):
  inputs *= self.num_units**0.5
  if self.position_encoder is not None:
    if step is None:
      inputs = self.position_encoder(inputs, sequence_length=sequence_length)
    else:
      inputs = self.position_encoder.apply_one(inputs, step + 1)
  inputs = tf.layers.dropout(
      inputs,
      rate=self.dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)
  decoder_mask = None
  memory_mask = None
  last_attention = None
  if self.self_attention_type == "scaled_dot":
    if sequence_length is not None:
      decoder_mask = transformer.build_future_mask(
          sequence_length,
          num_heads=self.num_heads,
          maximum_length=tf.shape(inputs)[1])
  elif self.self_attention_type == "average":
    if cache is None:
      if sequence_length is None:
        sequence_length = tf.fill([tf.shape(inputs)[0]], tf.shape(inputs)[1])
      decoder_mask = transformer.cumulative_average_mask(
          sequence_length, maximum_length=tf.shape(inputs)[1], dtype=inputs.dtype)
  if memory is not None and memory_sequence_length is not None:
    memory_mask = transformer.build_sequence_mask(
        memory_sequence_length,
        num_heads=self.num_heads,
        maximum_length=tf.shape(memory)[1])
  for l in range(self.num_layers):
    layer_name = "layer_{}".format(l)
    layer_cache = cache[layer_name] if cache is not None else None
    with tf.variable_scope(layer_name):
      if self.self_attention_type == "scaled_dot":
        with tf.variable_scope("masked_multi_head"):
          encoded = transformer.multi_head_attention(
              self.num_heads,
              transformer.norm(inputs),
              None,
              mode,
              num_units=self.num_units,
              mask=decoder_mask,
              cache=layer_cache,
              dropout=self.attention_dropout)
          encoded = transformer.drop_and_add(
              inputs, encoded, mode, dropout=self.dropout)
      elif self.self_attention_type == "average":
        with tf.variable_scope("average_attention"):
          # Cumulative average.
          x = transformer.norm(inputs)
          y = transformer.cumulative_average(
              x, decoder_mask if cache is None else step, cache=layer_cache)
          # FFN.
          y = transformer.feed_forward(
              y, self.ffn_inner_dim, mode, dropout=self.relu_dropout)
          # Gating layer.
          z = tf.layers.dense(tf.concat([x, y], -1), self.num_units * 2)
          i, f = tf.split(z, 2, axis=-1)
          y = tf.sigmoid(i) * x + tf.sigmoid(f) * y
          encoded = transformer.drop_and_add(
              inputs, y, mode, dropout=self.dropout)
      if memory is not None:
        with tf.variable_scope("multi_head"):
          context, last_attention = transformer.multi_head_attention(
              self.num_heads,
              transformer.norm(encoded),
              memory,
              mode,
              mask=memory_mask,
              cache=layer_cache,
              dropout=self.attention_dropout,
              return_attention=True)
          context = transformer.drop_and_add(
              encoded, context, mode, dropout=self.dropout)
      else:
        context = encoded
      with tf.variable_scope("ffn"):
        transformed = transformer.feed_forward(
            transformer.norm(context),
            self.ffn_inner_dim,
            mode,
            dropout=self.relu_dropout)
        transformed = transformer.drop_and_add(
            context, transformed, mode, dropout=self.dropout)
      inputs = transformed
  if last_attention is not None:
    # The first head of the last layer is returned.
    first_head_attention = last_attention[:, 0]
  else:
    first_head_attention = None
  outputs = transformer.norm(inputs)
  return outputs, first_head_attention

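# Side note (not from the original code): the "Gating layer" above combines the layer
# input x with the FFN-transformed cumulative average y through two learned gates,
#   [i; f] = dense([x; y]),   out = sigmoid(i) * x + sigmoid(f) * y,
# i.e. one gate on the residual path and one on the averaged path. A minimal standalone
# sketch of that gate, with an assumed num_units value:
def gated_average_combination(x, y, num_units=512):
  z = tf.layers.dense(tf.concat([x, y], -1), num_units * 2)  # [..., 2 * num_units]
  i, f = tf.split(z, 2, axis=-1)                             # input / forget gates
  return tf.sigmoid(i) * x + tf.sigmoid(f) * y
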
def _self_attention_stack(self,
                          inputs,                       # [batch, max_dec_len, emb_dim]
                          sequence_length=None,         # [batch]
                          mode=tf.estimator.ModeKeys.TRAIN,
                          cache=None,
                          memory=None,                  # [batch, enc_len, num_units]
                          memory_sequence_length=None,  # [batch]
                          step=None):
  inputs *= self.num_units**0.5
  if self.position_encoder is not None:
    inputs = self.position_encoder(
        inputs, position=step + 1 if step is not None else None)
  inputs = tf.layers.dropout(  # [batch, max_dec_len, emb_dim]
      inputs,
      rate=self.dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)
  decoder_mask = None
  memory_mask = None
  last_attention = None
  if self.self_attention_type == "scaled_dot":
    # sequence_length is None when decoding, set at training time.
    if sequence_length is not None:
      decoder_mask = transformer.build_future_mask(  # [batch, 1, max_dec_len, max_dec_len]
          sequence_length,
          num_heads=self.num_heads,
          maximum_length=tf.shape(inputs)[1])
  elif self.self_attention_type == "average":
    if cache is None:
      if sequence_length is None:
        sequence_length = tf.fill([tf.shape(inputs)[0]], tf.shape(inputs)[1])
      decoder_mask = transformer.cumulative_average_mask(
          sequence_length, maximum_length=tf.shape(inputs)[1], dtype=inputs.dtype)
  if memory is not None and not tf.contrib.framework.nest.is_sequence(memory):
    memory = (memory,)
  if memory_sequence_length is not None:
    if not tf.contrib.framework.nest.is_sequence(memory_sequence_length):
      memory_sequence_length = (memory_sequence_length,)
    memory_mask = [  # each [batch, 1, 1, enc_len]
        transformer.build_sequence_mask(
            length, num_heads=self.num_heads, maximum_length=tf.shape(m)[1])
        for m, length in zip(memory, memory_sequence_length)]
  for l in range(self.num_layers):
    layer_name = "layer_{}".format(l)
    # No cache at training time; a cache is provided when decoding.
    layer_cache = cache[layer_name] if cache is not None else None
    with tf.variable_scope(layer_name):
      # Self-attention over the decoder input (training) or the last step output (decoding).
      if self.self_attention_type == "scaled_dot":
        with tf.variable_scope("masked_multi_head"):
          encoded = transformer.multi_head_attention(  # [batch, decode_len, hidden]
              self.num_heads,
              transformer.norm(inputs),
              None,
              mode,
              num_units=self.num_units,
              mask=decoder_mask,  # [batch, 1, len, len]
              cache=layer_cache,
              dropout=self.attention_dropout)
          last_context = transformer.drop_and_add(  # [batch, decode_len, hidden]
              inputs, encoded, mode, dropout=self.dropout)
      elif self.self_attention_type == "average":
        with tf.variable_scope("average_attention"):
          # Cumulative average.
          x = transformer.norm(inputs)
          y = transformer.cumulative_average(
              x, decoder_mask if cache is None else step, cache=layer_cache)
          # FFN.
          y = transformer.feed_forward(
              y, self.ffn_inner_dim, mode, dropout=self.relu_dropout)
          # Gating layer.
          z = tf.layers.dense(tf.concat([x, y], -1), self.num_units * 2)
          i, f = tf.split(z, 2, axis=-1)
          y = tf.sigmoid(i) * x + tf.sigmoid(f) * y
          last_context = transformer.drop_and_add(
              inputs, y, mode, dropout=self.dropout)
      # Attend to the encoder memories using the decoder context.
      if memory is not None:
        for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
          # No cache at training time; a cache is provided when decoding.
          memory_cache = layer_cache["memory"][i] if layer_cache is not None else None
          with tf.variable_scope("multi_head" if i == 0 else "multi_head_%d" % i):
            context, last_attention = transformer.multi_head_attention(
                self.num_heads,
                transformer.norm(last_context),
                mem,  # [batch, enc_len, dim]
                mode,
                mask=mask,  # [batch, 1, 1, enc_len]
                cache=memory_cache,
                dropout=self.attention_dropout,
                return_attention=True)
            # context: [batch, decode_len, num_units]
            # last_attention (training): [batch, head, dec_len, enc_len]
            last_context = transformer.drop_and_add(
                last_context,  # [batch, decode_len, num_units]
                context,
                mode,
                dropout=self.dropout)
            if i > 0:
              # Do not return attention in case of multi source.
              last_attention = None
      with tf.variable_scope("ffn"):
        transformed = transformer.feed_forward(  # [batch, decode_len, num_units]
            transformer.norm(last_context),
            self.ffn_inner_dim,
            mode,
            dropout=self.relu_dropout)
        transformed = transformer.drop_and_add(  # [batch, decode_len, num_units]
            last_context, transformed, mode, dropout=self.dropout)
      inputs = transformed
  if last_attention is not None:
    # The first head of the last layer is returned.
    first_head_attention = last_attention[:, 0]
  else:
    first_head_attention = None
  outputs = transformer.norm(inputs)  # [batch, decode_len, num_units]
  return outputs, first_head_attention

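# Hedged usage sketch (not from the original code): a training-time call of the stack above
# with two encoder memories. Variable names and shapes are illustrative assumptions; with
# more than one memory the returned alignment is None, with a single memory it is the first
# attention head of the last layer.
def example_decoder_stack_call(self, dec_emb, dec_len, table_memory, table_len,
                               doc_memory, doc_len):
  # dec_emb: [batch, max_dec_len, emb_dim], dec_len: [batch]
  # table_memory / doc_memory: [batch, enc_len_i, num_units], table_len / doc_len: [batch]
  outputs, first_head_attention = self._self_attention_stack(
      dec_emb,
      sequence_length=dec_len,
      mode=tf.estimator.ModeKeys.TRAIN,
      memory=(table_memory, doc_memory),
      memory_sequence_length=(table_len, doc_len))
  return outputs, first_head_attention  # outputs: [batch, max_dec_len, num_units]
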
def encode(self, inputs, sequence_length=None, mode=tf.estimator.ModeKeys.TRAIN):
  """
  :param inputs: [batch, enc_len, emb_dim]
  :param sequence_length: [batch]
  :param mode:
  :return:
    outputs: [batch, len, dim], last layer output
    state: a tuple of num_layers tensors [batch, dim], the mean over len of each layer's outputs
    sequence_length: [batch]
  """
  inputs *= self.num_units**0.5
  if self.position_encoder is not None:
    inputs = self.position_encoder(inputs)
  inputs = tf.layers.dropout(
      inputs,
      rate=self.dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)
  mask = transformer.build_sequence_mask(  # [batch, 1, 1, enc_len]
      sequence_length,
      num_heads=self.num_heads,
      maximum_length=tf.shape(inputs)[1])
  state = ()
  for l in range(self.num_layers):
    with tf.variable_scope("layer_{}".format(l)):
      with tf.variable_scope("multi_head"):
        context = transformer.multi_head_attention(  # [batch, len, dim]
            self.num_heads,
            transformer.norm(inputs),
            None,
            mode,
            num_units=self.num_units,
            mask=mask,
            dropout=self.attention_dropout)
        context = transformer.drop_and_add(  # [batch, len, dim]
            inputs, context, mode, dropout=self.dropout)
      with tf.variable_scope("ffn"):
        transformed = transformer.feed_forward(  # [batch, len, dim]
            transformer.norm(context),
            self.ffn_inner_dim,
            mode,
            dropout=self.relu_dropout)
        transformed = transformer.drop_and_add(  # [batch, len, dim]
            context, transformed, mode, dropout=self.dropout)
      inputs = transformed
      state += (tf.reduce_mean(inputs, axis=1),)
  outputs = transformer.norm(inputs)  # [batch, len, dim]
  return (outputs, state, sequence_length)
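
# Hedged usage sketch (not from the original code): calling encode() on an embedded source
# batch. The variable names below are illustrative assumptions.
def example_encode_call(self, src_emb, src_len):
  # src_emb: [batch, enc_len, emb_dim] embedded source tokens, src_len: [batch]
  outputs, state, src_len = self.encode(
      src_emb, sequence_length=src_len, mode=tf.estimator.ModeKeys.TRAIN)
  # outputs: [batch, enc_len, num_units]; state: num_layers tensors of shape [batch, num_units]
  return outputs, state, src_len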