def __init__(self,
             source_inputter,
             target_inputter,
             num_layers,
             num_units,
             num_heads,
             ffn_inner_dim,
             dropout=0.1,
             attention_dropout=0.1,
             relu_dropout=0.1,
             position_encoder=SinusoidalPositionEncoder(),
             decoder_self_attention_type="scaled_dot",
             share_embeddings=EmbeddingsSharingLevel.NONE,
             share_encoders=False,
             alignment_file_key="train_alignments",
             name="transformer"):
    """Initializes a Transformer model.

    Args:
      source_inputter: A :class:`opennmt.inputters.inputter.Inputter` to
        process the source data. If this inputter returns parallel inputs, a
        multi source Transformer architecture will be constructed.
      target_inputter: A :class:`opennmt.inputters.inputter.Inputter` to
        process the target data. Currently, only the
        :class:`opennmt.inputters.text_inputter.WordEmbedder` is supported.
      num_layers: The shared number of layers.
      num_units: The number of hidden units.
      num_heads: The number of heads in each self-attention layer.
      ffn_inner_dim: The inner dimension of the feed forward layers.
      dropout: The probability to drop units in each layer output.
      attention_dropout: The probability to drop units from the attention.
      relu_dropout: The probability to drop units from the ReLU activation in
        the feed forward layer.
      position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
        apply on the inputs.
      decoder_self_attention_type: Type of self-attention in the decoder,
        "scaled_dot" or "average" (case insensitive).
      share_embeddings: Level of embeddings sharing, see
        :class:`opennmt.models.sequence_to_sequence.EmbeddingsSharingLevel`
        for possible values.
      share_encoders: In case of multi source architecture, whether to share
        the parameters of the separate encoders.
      alignment_file_key: The data configuration key of the training alignment
        file to support guided alignment.
      name: The name of this model.
    """
    encoders = [
        SelfAttentionEncoder(
            num_layers,
            num_units=num_units,
            num_heads=num_heads,
            ffn_inner_dim=ffn_inner_dim,
            dropout=dropout,
            attention_dropout=attention_dropout,
            relu_dropout=relu_dropout,
            position_encoder=position_encoder)
        for _ in range(source_inputter.num_outputs)
    ]
    if len(encoders) > 1:
        encoder = ParallelEncoder(
            encoders,
            outputs_reducer=None,
            states_reducer=None,
            share_parameters=share_encoders)
    else:
        encoder = encoders[0]
    decoder = SelfAttentionDecoder(
        num_layers,
        num_units=num_units,
        num_heads=num_heads,
        ffn_inner_dim=ffn_inner_dim,
        dropout=dropout,
        attention_dropout=attention_dropout,
        relu_dropout=relu_dropout,
        position_encoder=position_encoder,
        self_attention_type=decoder_self_attention_type)

    self._num_units = num_units
    super(Transformer, self).__init__(
        source_inputter,
        target_inputter,
        encoder,
        decoder,
        share_embeddings=share_embeddings,
        alignment_file_key=alignment_file_key,
        daisy_chain_variables=True,
        name=name)
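# Usage sketch (illustrative, not part of the original source): the base
# configuration of "Attention Is All You Need" built with the constructor
# above. ``src_inputter`` and ``tgt_inputter`` are assumed to be WordEmbedder
# instances configured elsewhere; passing "average" swaps the decoder
# self-attention for the faster average attention variant.
#
#   model = Transformer(
#       src_inputter,
#       tgt_inputter,
#       num_layers=6,
#       num_units=512,
#       num_heads=8,
#       ffn_inner_dim=2048,
#       dropout=0.1,
#       attention_dropout=0.1,
#       relu_dropout=0.1,
#       decoder_self_attention_type="average")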
def __init__(self,
             source_inputter,
             target_inputter,
             num_layers,
             num_units,
             num_heads,
             ffn_inner_dim,
             dropout=0.1,
             attention_dropout=0.1,
             ffn_dropout=0.1,
             ffn_activation=tf.nn.relu,
             position_encoder_class=SinusoidalPositionEncoder,
             share_embeddings=EmbeddingsSharingLevel.NONE,
             share_encoders=False,
             maximum_relative_position=None):
    """Initializes a Transformer model.

    Args:
      source_inputter: A :class:`opennmt.inputters.Inputter` to process the
        source data. If this inputter returns parallel inputs, a multi source
        Transformer architecture will be constructed.
      target_inputter: A :class:`opennmt.inputters.Inputter` to process the
        target data. Currently, only the
        :class:`opennmt.inputters.WordEmbedder` is supported.
      num_layers: The number of layers or a 2-tuple with the number of encoder
        layers and decoder layers.
      num_units: The number of hidden units.
      num_heads: The number of heads in each self-attention layer.
      ffn_inner_dim: The inner dimension of the feed forward layers.
      dropout: The probability to drop units in each layer output.
      attention_dropout: The probability to drop units from the attention.
      ffn_dropout: The probability to drop units from the ReLU activation in
        the feed forward layer.
      ffn_activation: The activation function to apply between the two linear
        transformations of the feed forward layer.
      position_encoder_class: The :class:`opennmt.layers.PositionEncoder`
        class to use for position encoding (or a callable that returns an
        instance).
      share_embeddings: Level of embeddings sharing, see
        :class:`opennmt.models.EmbeddingsSharingLevel` for possible values.
      share_encoders: In case of multi source architecture, whether to share
        the parameters of the separate encoders.
      maximum_relative_position: Maximum relative position representation
        (from https://arxiv.org/abs/1803.02155).
    """
    if isinstance(num_layers, (list, tuple)):
        num_encoder_layers, num_decoder_layers = num_layers
    else:
        num_encoder_layers, num_decoder_layers = num_layers, num_layers
    encoders = [
        SelfAttentionEncoder(
            num_encoder_layers,
            num_units=num_units,
            num_heads=num_heads,
            ffn_inner_dim=ffn_inner_dim,
            dropout=dropout,
            attention_dropout=attention_dropout,
            ffn_dropout=ffn_dropout,
            ffn_activation=ffn_activation,
            position_encoder_class=position_encoder_class,
            maximum_relative_position=maximum_relative_position)
        for _ in range(source_inputter.num_outputs)
    ]
    if len(encoders) > 1:
        encoder = ParallelEncoder(
            encoders if not share_encoders else encoders[0],
            outputs_reducer=None,
            states_reducer=None)
    else:
        encoder = encoders[0]
    decoder = SelfAttentionDecoder(
        num_decoder_layers,
        num_units=num_units,
        num_heads=num_heads,
        ffn_inner_dim=ffn_inner_dim,
        dropout=dropout,
        attention_dropout=attention_dropout,
        ffn_dropout=ffn_dropout,
        ffn_activation=ffn_activation,
        position_encoder_class=position_encoder_class,
        num_sources=source_inputter.num_outputs,
        maximum_relative_position=maximum_relative_position)

    self._num_units = num_units
    self._num_encoder_layers = num_encoder_layers
    self._num_decoder_layers = num_decoder_layers
    self._num_heads = num_heads
    self._with_relative_position = maximum_relative_position is not None
    self._is_ct2_compatible = (
        isinstance(encoder, SelfAttentionEncoder)
        and num_encoder_layers == num_decoder_layers
        and ffn_activation == tf.nn.relu
        and ((self._with_relative_position and position_encoder_class is None)
             or (not self._with_relative_position
                 and position_encoder_class == SinusoidalPositionEncoder)))
    super(Transformer, self).__init__(
        source_inputter,
        target_inputter,
        encoder,
        decoder,
        share_embeddings=share_embeddings)
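# Usage sketch (illustrative, not part of the original source): an asymmetric
# encoder/decoder depth combined with relative position representations. When
# relative positions are enabled, the sinusoidal encoding is typically
# disabled by passing ``position_encoder_class=None``. ``src_inputter`` and
# ``tgt_inputter`` are assumed to be WordEmbedder instances configured
# elsewhere.
#
#   model = Transformer(
#       src_inputter,
#       tgt_inputter,
#       num_layers=(6, 3),  # 6 encoder layers, 3 decoder layers
#       num_units=512,
#       num_heads=8,
#       ffn_inner_dim=2048,
#       position_encoder_class=None,
#       maximum_relative_position=20)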
def __init__(self,
             source_inputter,
             target_inputter,
             num_layers,
             num_units,
             num_heads,
             ffn_inner_dim,
             dropout=0.1,
             attention_dropout=0.1,
             ffn_dropout=0.1,
             ffn_activation=tf.nn.relu,
             position_encoder_class=SinusoidalPositionEncoder,
             share_embeddings=EmbeddingsSharingLevel.NONE,
             share_encoders=False,
             maximum_relative_position=None,
             attention_span=None,
             num_attended_heads=1):
    """Initializes a Transformer model.

    Args:
      source_inputter: A :class:`opennmt.inputters.Inputter` to process the
        source data. If this inputter returns parallel inputs, a multi source
        Transformer architecture will be constructed.
      target_inputter: A :class:`opennmt.inputters.Inputter` to process the
        target data. Currently, only the
        :class:`opennmt.inputters.WordEmbedder` is supported.
      num_layers: The shared number of layers.
      num_units: The number of hidden units.
      num_heads: The number of heads in each self-attention layer.
      ffn_inner_dim: The inner dimension of the feed forward layers.
      dropout: The probability to drop units in each layer output.
      attention_dropout: The probability to drop units from the attention.
      ffn_dropout: The probability to drop units from the ReLU activation in
        the feed forward layer.
      ffn_activation: The activation function to apply between the two linear
        transformations of the feed forward layer.
      position_encoder_class: The :class:`opennmt.layers.PositionEncoder`
        class to use for position encoding (or a callable that returns an
        instance).
      share_embeddings: Level of embeddings sharing, see
        :class:`opennmt.models.EmbeddingsSharingLevel` for possible values.
      share_encoders: In case of multi source architecture, whether to share
        the parameters of the separate encoders.
      maximum_relative_position: Maximum relative position representation
        (from https://arxiv.org/abs/1803.02155).
      attention_span: Maximum relative position to attend to
        (from https://arxiv.org/abs/1904.03107).
      num_attended_heads: How many heads should be attended. Defaults to 1 as
        each head only attends to itself in the vanilla Transformer. Increase
        to an odd number < `num_heads` to also model head interaction
        (from https://arxiv.org/abs/1904.03107).
    """
    encoders = [
        SelfAttentionEncoder(
            num_layers,
            num_units=num_units,
            num_heads=num_heads,
            ffn_inner_dim=ffn_inner_dim,
            dropout=dropout,
            attention_dropout=attention_dropout,
            ffn_dropout=ffn_dropout,
            ffn_activation=ffn_activation,
            position_encoder_class=position_encoder_class,
            maximum_relative_position=maximum_relative_position,
            attention_span=attention_span,
            num_attended_heads=num_attended_heads)
        for _ in range(source_inputter.num_outputs)
    ]
    if len(encoders) > 1:
        encoder = ParallelEncoder(
            encoders if not share_encoders else encoders[0],
            outputs_reducer=None,
            states_reducer=None)
    else:
        encoder = encoders[0]
    decoder = SelfAttentionDecoder(
        num_layers,
        num_units=num_units,
        num_heads=num_heads,
        ffn_inner_dim=ffn_inner_dim,
        dropout=dropout,
        attention_dropout=attention_dropout,
        ffn_dropout=ffn_dropout,
        ffn_activation=ffn_activation,
        position_encoder_class=position_encoder_class,
        num_sources=source_inputter.num_outputs,
        maximum_relative_position=maximum_relative_position)

    self._num_units = num_units
    super(Transformer, self).__init__(
        source_inputter,
        target_inputter,
        encoder,
        decoder,
        share_embeddings=share_embeddings)
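# Usage sketch (illustrative, not part of the original source): restrict
# encoder self-attention to a fixed relative window and let each head also
# attend to neighbouring heads (https://arxiv.org/abs/1904.03107). As the
# docstring notes, ``num_attended_heads`` should be odd and smaller than
# ``num_heads``; the decoder is left unchanged by these two options.
#
#   model = Transformer(
#       src_inputter,
#       tgt_inputter,
#       num_layers=6,
#       num_units=512,
#       num_heads=8,
#       ffn_inner_dim=2048,
#       attention_span=16,
#       num_attended_heads=3)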
def __init__(
    self,
    source_inputter,
    target_inputter,
    num_layers,
    num_units,
    num_heads,
    ffn_inner_dim,
    dropout=0.1,
    attention_dropout=0.1,
    ffn_dropout=0.1,
    ffn_activation=tf.nn.relu,
    position_encoder_class=SinusoidalPositionEncoder,
    share_embeddings=EmbeddingsSharingLevel.NONE,
    share_encoders=False,
    maximum_relative_position=None,
    attention_reduction=MultiHeadAttentionReduction.FIRST_HEAD_LAST_LAYER,
    pre_norm=True,
):
    """Initializes a Transformer model.

    Args:
      source_inputter: A :class:`opennmt.inputters.Inputter` to process the
        source data. If this inputter returns parallel inputs, a multi source
        Transformer architecture will be constructed.
      target_inputter: A :class:`opennmt.inputters.Inputter` to process the
        target data. Currently, only the
        :class:`opennmt.inputters.WordEmbedder` is supported.
      num_layers: The number of layers or a 2-tuple with the number of encoder
        layers and decoder layers.
      num_units: The number of hidden units.
      num_heads: The number of heads in each self-attention layer.
      ffn_inner_dim: The inner dimension of the feed forward layers.
      dropout: The probability to drop units in each layer output.
      attention_dropout: The probability to drop units from the attention.
      ffn_dropout: The probability to drop units from the ReLU activation in
        the feed forward layer.
      ffn_activation: The activation function to apply between the two linear
        transformations of the feed forward layer.
      position_encoder_class: The :class:`opennmt.layers.PositionEncoder`
        class to use for position encoding (or a callable that returns an
        instance).
      share_embeddings: Level of embeddings sharing, see
        :class:`opennmt.models.EmbeddingsSharingLevel` for possible values.
      share_encoders: In case of multi source architecture, whether to share
        the parameters of the separate encoders.
      maximum_relative_position: Maximum relative position representation
        (from https://arxiv.org/abs/1803.02155).
      attention_reduction: A :class:`opennmt.layers.MultiHeadAttentionReduction`
        value to specify how to reduce target-source multi-head attention
        matrices.
      pre_norm: If ``True``, layer normalization is applied before each
        sub-layer. Otherwise it is applied after. The original paper uses
        ``pre_norm=False``, but the authors later suggested that
        ``pre_norm=True`` "seems better for harder-to-learn models, so it
        should probably be the default."
    """
    if isinstance(num_layers, (list, tuple)):
        num_encoder_layers, num_decoder_layers = num_layers
    else:
        num_encoder_layers, num_decoder_layers = num_layers, num_layers
    encoders = [
        SelfAttentionEncoder(
            num_encoder_layers,
            num_units=num_units,
            num_heads=num_heads,
            ffn_inner_dim=ffn_inner_dim,
            dropout=dropout,
            attention_dropout=attention_dropout,
            ffn_dropout=ffn_dropout,
            ffn_activation=ffn_activation,
            position_encoder_class=position_encoder_class,
            maximum_relative_position=maximum_relative_position,
            pre_norm=pre_norm,
        )
        for _ in range(source_inputter.num_outputs)
    ]
    if len(encoders) > 1:
        encoder = ParallelEncoder(
            encoders if not share_encoders else encoders[0],
            outputs_reducer=None,
            states_reducer=None,
        )
    else:
        encoder = encoders[0]
    decoder = SelfAttentionDecoder(
        num_decoder_layers,
        num_units=num_units,
        num_heads=num_heads,
        ffn_inner_dim=ffn_inner_dim,
        dropout=dropout,
        attention_dropout=attention_dropout,
        ffn_dropout=ffn_dropout,
        ffn_activation=ffn_activation,
        position_encoder_class=position_encoder_class,
        num_sources=source_inputter.num_outputs,
        maximum_relative_position=maximum_relative_position,
        attention_reduction=attention_reduction,
        pre_norm=pre_norm,
    )

    self._num_units = num_units
    self._num_encoder_layers = num_encoder_layers
    self._num_decoder_layers = num_decoder_layers
    self._num_heads = num_heads
    self._with_relative_position = maximum_relative_position is not None
    self._is_ct2_compatible = (
        isinstance(encoder, SelfAttentionEncoder)
        and pre_norm
        and ffn_activation is tf.nn.relu
        and (
            (self._with_relative_position and position_encoder_class is None)
            or (
                not self._with_relative_position
                and position_encoder_class == SinusoidalPositionEncoder
            )
        )
    )
    super().__init__(
        source_inputter,
        target_inputter,
        encoder,
        decoder,
        share_embeddings=share_embeddings,
    )