Example #1
    def __init__(self,
                 source_inputter,
                 target_inputter,
                 num_layers,
                 num_units,
                 num_heads,
                 ffn_inner_dim,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 position_encoder=SinusoidalPositionEncoder(),
                 decoder_self_attention_type="scaled_dot",
                 share_embeddings=EmbeddingsSharingLevel.NONE,
                 share_encoders=False,
                 alignment_file_key="train_alignments",
                 name="transformer"):
        """Initializes a Transformer model.

    Args:
      source_inputter: A :class:`opennmt.inputters.inputter.Inputter` to process
        the source data. If this inputter returns parallel inputs, a multi
        source Transformer architecture will be constructed.
      target_inputter: A :class:`opennmt.inputters.inputter.Inputter` to process
        the target data. Currently, only the
        :class:`opennmt.inputters.text_inputter.WordEmbedder` is supported.
      num_layers: The shared number of layers.
      num_units: The number of hidden units.
      num_heads: The number of heads in each self-attention layers.
      ffn_inner_dim: The inner dimension of the feed forward layers.
      dropout: The probability to drop units in each layer output.
      attention_dropout: The probability to drop units from the attention.
      relu_dropout: The probability to drop units from the ReLU activation in
        the feed forward layer.
      position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
        apply on the inputs.
      decoder_self_attention_type: Type of self attention in the decoder,
        "scaled_dot" or "average" (case insensitive).
      share_embeddings: Level of embeddings sharing, see
        :class:`opennmt.models.sequence_to_sequence.EmbeddingsSharingLevel`
        for possible values.
      share_encoders: In case of multi source architecture, whether to share the
        separate encoders parameters or not.
      alignment_file_key: The data configuration key of the training alignment
        file to support guided alignment.
      name: The name of this model.
    """
        encoders = [
            SelfAttentionEncoder(num_layers,
                                 num_units=num_units,
                                 num_heads=num_heads,
                                 ffn_inner_dim=ffn_inner_dim,
                                 dropout=dropout,
                                 attention_dropout=attention_dropout,
                                 relu_dropout=relu_dropout,
                                 position_encoder=position_encoder)
            for _ in range(source_inputter.num_outputs)
        ]
        if len(encoders) > 1:
            encoder = ParallelEncoder(encoders,
                                      outputs_reducer=None,
                                      states_reducer=None,
                                      share_parameters=share_encoders)
        else:
            encoder = encoders[0]
        decoder = SelfAttentionDecoder(
            num_layers,
            num_units=num_units,
            num_heads=num_heads,
            ffn_inner_dim=ffn_inner_dim,
            dropout=dropout,
            attention_dropout=attention_dropout,
            relu_dropout=relu_dropout,
            position_encoder=position_encoder,
            self_attention_type=decoder_self_attention_type)

        self._num_units = num_units
        super(Transformer,
              self).__init__(source_inputter,
                             target_inputter,
                             encoder,
                             decoder,
                             share_embeddings=share_embeddings,
                             alignment_file_key=alignment_file_key,
                             daisy_chain_variables=True,
                             name=name)
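
A minimal usage sketch for this 1.x-era signature. It assumes the OpenNMT-tf 1.x entry points opennmt.models.Transformer and opennmt.inputters.WordEmbedder; the vocabulary file keys and hyperparameter values are illustrative placeholders, not taken from the code above.

import opennmt as onmt

# Base Transformer configuration (illustrative values).
model = onmt.models.Transformer(
    source_inputter=onmt.inputters.WordEmbedder(
        vocabulary_file_key="source_words_vocabulary", embedding_size=512),
    target_inputter=onmt.inputters.WordEmbedder(
        vocabulary_file_key="target_words_vocabulary", embedding_size=512),
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048,
    dropout=0.1,
    attention_dropout=0.1,
    relu_dropout=0.1)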
Example #2
    def __init__(self,
                 source_inputter,
                 target_inputter,
                 num_layers,
                 num_units,
                 num_heads,
                 ffn_inner_dim,
                 dropout=0.1,
                 attention_dropout=0.1,
                 ffn_dropout=0.1,
                 ffn_activation=tf.nn.relu,
                 position_encoder_class=SinusoidalPositionEncoder,
                 share_embeddings=EmbeddingsSharingLevel.NONE,
                 share_encoders=False,
                 maximum_relative_position=None):
        """Initializes a Transformer model.

    Args:
      source_inputter: A :class:`opennmt.inputters.Inputter` to process
        the source data. If this inputter returns parallel inputs, a multi
        source Transformer architecture will be constructed.
      target_inputter: A :class:`opennmt.inputters.Inputter` to process
        the target data. Currently, only the
        :class:`opennmt.inputters.WordEmbedder` is supported.
      num_layers: The number of layers or a 2-tuple with the number of encoder
        layers and decoder layers.
      num_units: The number of hidden units.
      num_heads: The number of heads in each self-attention layers.
      ffn_inner_dim: The inner dimension of the feed forward layers.
      dropout: The probability to drop units in each layer output.
      attention_dropout: The probability to drop units from the attention.
      ffn_dropout: The probability to drop units from the ReLU activation in
        the feed forward layer.
      ffn_activation: The activation function to apply between the two linear
        transformations of the feed forward layer.
      position_encoder_class: The :class:`opennmt.layers.PositionEncoder`
        class to use for position encoding (or a callable that returns an
        instance).
      share_embeddings: Level of embeddings sharing, see
        :class:`opennmt.models.EmbeddingsSharingLevel` for possible values.
      share_encoders: In case of multi source architecture, whether to share the
        separate encoders parameters or not.
      maximum_relative_position: Maximum relative position representation
        (from https://arxiv.org/abs/1803.02155).
    """
        if isinstance(num_layers, (list, tuple)):
            num_encoder_layers, num_decoder_layers = num_layers
        else:
            num_encoder_layers, num_decoder_layers = num_layers, num_layers
        encoders = [
            SelfAttentionEncoder(
                num_encoder_layers,
                num_units=num_units,
                num_heads=num_heads,
                ffn_inner_dim=ffn_inner_dim,
                dropout=dropout,
                attention_dropout=attention_dropout,
                ffn_dropout=ffn_dropout,
                ffn_activation=ffn_activation,
                position_encoder_class=position_encoder_class,
                maximum_relative_position=maximum_relative_position)
            for _ in range(source_inputter.num_outputs)
        ]
        if len(encoders) > 1:
            encoder = ParallelEncoder(
                encoders if not share_encoders else encoders[0],
                outputs_reducer=None,
                states_reducer=None)
        else:
            encoder = encoders[0]
        decoder = SelfAttentionDecoder(
            num_decoder_layers,
            num_units=num_units,
            num_heads=num_heads,
            ffn_inner_dim=ffn_inner_dim,
            dropout=dropout,
            attention_dropout=attention_dropout,
            ffn_dropout=ffn_dropout,
            ffn_activation=ffn_activation,
            position_encoder_class=position_encoder_class,
            num_sources=source_inputter.num_outputs,
            maximum_relative_position=maximum_relative_position)

        self._num_units = num_units
        self._num_encoder_layers = num_encoder_layers
        self._num_decoder_layers = num_decoder_layers
        self._num_heads = num_heads
        self._with_relative_position = maximum_relative_position is not None
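        # CTranslate2 export compatibility: a plain self-attention encoder, equal
        # encoder and decoder depths, a ReLU feed-forward activation, and either
        # relative positions without a position encoder or sinusoidal absolute positions.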
        self._is_ct2_compatible = (
            isinstance(encoder, SelfAttentionEncoder)
            and num_encoder_layers == num_decoder_layers
            and ffn_activation == tf.nn.relu and
            ((self._with_relative_position and position_encoder_class is None)
             or (not self._with_relative_position
                 and position_encoder_class == SinusoidalPositionEncoder)))
        super(Transformer, self).__init__(source_inputter,
                                          target_inputter,
                                          encoder,
                                          decoder,
                                          share_embeddings=share_embeddings)
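
A usage sketch for this 2.x-style signature, assuming the opennmt.models.Transformer and opennmt.inputters.WordEmbedder entry points; hyperparameter values are illustrative. Passing position_encoder_class=None together with maximum_relative_position switches from absolute to relative position representations.

import opennmt

# Base configuration with sinusoidal absolute positions (illustrative values).
model = opennmt.models.Transformer(
    source_inputter=opennmt.inputters.WordEmbedder(embedding_size=512),
    target_inputter=opennmt.inputters.WordEmbedder(embedding_size=512),
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048)

# Variant using relative position representations instead of a position encoder.
relative_model = opennmt.models.Transformer(
    source_inputter=opennmt.inputters.WordEmbedder(embedding_size=512),
    target_inputter=opennmt.inputters.WordEmbedder(embedding_size=512),
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048,
    position_encoder_class=None,
    maximum_relative_position=20)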
Example #3
    def __init__(self,
                 source_inputter,
                 target_inputter,
                 num_layers,
                 num_units,
                 num_heads,
                 ffn_inner_dim,
                 dropout=0.1,
                 attention_dropout=0.1,
                 ffn_dropout=0.1,
                 ffn_activation=tf.nn.relu,
                 position_encoder_class=SinusoidalPositionEncoder,
                 share_embeddings=EmbeddingsSharingLevel.NONE,
                 share_encoders=False,
                 maximum_relative_position=None,
                 attention_span=None,
                 num_attended_heads=1):
        """Initializes a Transformer model.

    Args:
      source_inputter: A :class:`opennmt.inputters.Inputter` to process
        the source data. If this inputter returns parallel inputs, a multi
        source Transformer architecture will be constructed.
      target_inputter: A :class:`opennmt.inputters.Inputter` to process
        the target data. Currently, only the
        :class:`opennmt.inputters.WordEmbedder` is supported.
      num_layers: The shared number of layers.
      num_units: The number of hidden units.
      num_heads: The number of heads in each self-attention layers.
      ffn_inner_dim: The inner dimension of the feed forward layers.
      dropout: The probability to drop units in each layer output.
      attention_dropout: The probability to drop units from the attention.
      ffn_dropout: The probability to drop units from the ReLU activation in
        the feed forward layer.
      ffn_activation: The activation function to apply between the two linear
        transformations of the feed forward layer.
      position_encoder_class: The :class:`opennmt.layers.PositionEncoder`
        class to use for position encoding (or a callable that returns an
        instance).
      share_embeddings: Level of embeddings sharing, see
        :class:`opennmt.models.EmbeddingsSharingLevel` for possible values.
      share_encoders: In case of multi source architecture, whether to share the
        separate encoders parameters or not.
      maximum_relative_position: Maximum relative position representation
        (from https://arxiv.org/abs/1803.02155).
      attention_span: Maximum relative position to attend to
        (from https://arxiv.org/abs/1904.03107).
      num_attended_heads: How many heads should be attended. Defaults to 1
        as each head only attends to itself in vanilla Transformer. Increase to
        an odd number < `num_heads` to also model head interaction.
        (from ttps://arxiv.org/abs/1904.03107).
    """
        encoders = [
            SelfAttentionEncoder(
                num_layers,
                num_units=num_units,
                num_heads=num_heads,
                ffn_inner_dim=ffn_inner_dim,
                dropout=dropout,
                attention_dropout=attention_dropout,
                ffn_dropout=ffn_dropout,
                ffn_activation=ffn_activation,
                position_encoder_class=position_encoder_class,
                maximum_relative_position=maximum_relative_position,
                attention_span=attention_span,
                num_attended_heads=num_attended_heads)
            for _ in range(source_inputter.num_outputs)
        ]
        if len(encoders) > 1:
            encoder = ParallelEncoder(
                encoders if not share_encoders else encoders[0],
                outputs_reducer=None,
                states_reducer=None)
        else:
            encoder = encoders[0]
        decoder = SelfAttentionDecoder(
            num_layers,
            num_units=num_units,
            num_heads=num_heads,
            ffn_inner_dim=ffn_inner_dim,
            dropout=dropout,
            attention_dropout=attention_dropout,
            ffn_dropout=ffn_dropout,
            ffn_activation=ffn_activation,
            position_encoder_class=position_encoder_class,
            num_sources=source_inputter.num_outputs,
            maximum_relative_position=maximum_relative_position)

        self._num_units = num_units
        super(Transformer, self).__init__(source_inputter,
                                          target_inputter,
                                          encoder,
                                          decoder,
                                          share_embeddings=share_embeddings)
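
The attention_span and num_attended_heads arguments are not part of mainline OpenNMT-tf, so the sketch below only assumes that this variant exposes the Transformer and WordEmbedder names shown above; all values are illustrative.

# Hypothetical instantiation of the variant above (illustrative values).
model = Transformer(
    source_inputter=WordEmbedder(embedding_size=512),
    target_inputter=WordEmbedder(embedding_size=512),
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048,
    attention_span=16,      # limit self-attention to nearby relative positions
    num_attended_heads=3)   # odd number < num_heads to model head interaction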
Example #4
    def __init__(
        self,
        source_inputter,
        target_inputter,
        num_layers,
        num_units,
        num_heads,
        ffn_inner_dim,
        dropout=0.1,
        attention_dropout=0.1,
        ffn_dropout=0.1,
        ffn_activation=tf.nn.relu,
        position_encoder_class=SinusoidalPositionEncoder,
        share_embeddings=EmbeddingsSharingLevel.NONE,
        share_encoders=False,
        maximum_relative_position=None,
        attention_reduction=MultiHeadAttentionReduction.FIRST_HEAD_LAST_LAYER,
        pre_norm=True,
    ):
        """Initializes a Transformer model.

        Args:
          source_inputter: A :class:`opennmt.inputters.Inputter` to process
            the source data. If this inputter returns parallel inputs, a multi-source
            Transformer architecture will be constructed.
          target_inputter: A :class:`opennmt.inputters.Inputter` to process
            the target data. Currently, only the
            :class:`opennmt.inputters.WordEmbedder` is supported.
          num_layers: The number of layers or a 2-tuple with the number of encoder
            layers and decoder layers.
          num_units: The number of hidden units.
          num_heads: The number of heads in each self-attention layer.
          ffn_inner_dim: The inner dimension of the feed forward layers.
          dropout: The probability to drop units in each layer output.
          attention_dropout: The probability to drop units from the attention.
          ffn_dropout: The probability to drop units from the activation output in
            the feed forward layer.
          ffn_activation: The activation function to apply between the two linear
            transformations of the feed forward layer.
          position_encoder_class: The :class:`opennmt.layers.PositionEncoder`
            class to use for position encoding (or a callable that returns an
            instance).
          share_embeddings: Level of embeddings sharing, see
            :class:`opennmt.models.EmbeddingsSharingLevel` for possible values.
          share_encoders: In case of a multi-source architecture, whether to share
            the parameters of the separate encoders.
          maximum_relative_position: Maximum relative position representation
            (from https://arxiv.org/abs/1803.02155).
          attention_reduction: A :class:`opennmt.layers.MultiHeadAttentionReduction`
            value to specify how to reduce target-source multi-head attention
            matrices.
          pre_norm: If ``True``, layer normalization is applied before each
            sub-layer. Otherwise it is applied after. The original paper uses
            ``pre_norm=False``, but the authors later suggested that ``pre_norm=True``
            "seems better for harder-to-learn models, so it should probably be the
            default."
        """
        if isinstance(num_layers, (list, tuple)):
            num_encoder_layers, num_decoder_layers = num_layers
        else:
            num_encoder_layers, num_decoder_layers = num_layers, num_layers
        encoders = [
            SelfAttentionEncoder(
                num_encoder_layers,
                num_units=num_units,
                num_heads=num_heads,
                ffn_inner_dim=ffn_inner_dim,
                dropout=dropout,
                attention_dropout=attention_dropout,
                ffn_dropout=ffn_dropout,
                ffn_activation=ffn_activation,
                position_encoder_class=position_encoder_class,
                maximum_relative_position=maximum_relative_position,
                pre_norm=pre_norm,
            ) for _ in range(source_inputter.num_outputs)
        ]
        if len(encoders) > 1:
            encoder = ParallelEncoder(
                encoders if not share_encoders else encoders[0],
                outputs_reducer=None,
                states_reducer=None,
            )
        else:
            encoder = encoders[0]
        decoder = SelfAttentionDecoder(
            num_decoder_layers,
            num_units=num_units,
            num_heads=num_heads,
            ffn_inner_dim=ffn_inner_dim,
            dropout=dropout,
            attention_dropout=attention_dropout,
            ffn_dropout=ffn_dropout,
            ffn_activation=ffn_activation,
            position_encoder_class=position_encoder_class,
            num_sources=source_inputter.num_outputs,
            maximum_relative_position=maximum_relative_position,
            attention_reduction=attention_reduction,
            pre_norm=pre_norm,
        )

        self._num_units = num_units
        self._num_encoder_layers = num_encoder_layers
        self._num_decoder_layers = num_decoder_layers
        self._num_heads = num_heads
        self._with_relative_position = maximum_relative_position is not None
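        # CTranslate2 export compatibility: a plain self-attention encoder, pre-norm
        # layers, a ReLU feed-forward activation, and either relative positions
        # without a position encoder or sinusoidal absolute positions.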
        self._is_ct2_compatible = (
            isinstance(encoder, SelfAttentionEncoder) and pre_norm
            and ffn_activation is tf.nn.relu and
            ((self._with_relative_position and position_encoder_class is None)
             or (not self._with_relative_position
                 and position_encoder_class == SinusoidalPositionEncoder)))
        super().__init__(
            source_inputter,
            target_inputter,
            encoder,
            decoder,
            share_embeddings=share_embeddings,
        )
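
A usage sketch for the current signature, again assuming the opennmt.models.Transformer and opennmt.inputters.WordEmbedder entry points with illustrative values; setting pre_norm=False reproduces the post-norm layout of the original paper.

import opennmt

# Post-norm variant of the current signature (illustrative values).
model = opennmt.models.Transformer(
    source_inputter=opennmt.inputters.WordEmbedder(embedding_size=512),
    target_inputter=opennmt.inputters.WordEmbedder(embedding_size=512),
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048,
    pre_norm=False)  # apply layer normalization after each sub-layer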