def __init__(self,
             num_layers,
             num_units=512,
             num_heads=8,
             ffn_inner_dim=2048,
             dropout=0.1,
             attention_dropout=0.1,
             relu_dropout=0.1,
             position_encoder=SinusoidalPositionEncoder()):
  """Initializes the parameters of the encoder.

  Args:
    num_layers: The number of layers.
    num_units: The number of hidden units.
    num_heads: The number of heads in the multi-head attention.
    ffn_inner_dim: The number of units of the inner linear transformation
      in the feed forward layer.
    dropout: The probability to drop units from the outputs.
    attention_dropout: The probability to drop units from the attention.
    relu_dropout: The probability to drop units from the ReLU activation in
      the feed forward layer.
    position_encoder: The :class:`opennmt.layers.position.PositionEncoder`
      to apply on inputs or ``None``.
  """
  self.num_layers = num_layers
  self.num_units = num_units
  self.num_heads = num_heads
  self.ffn_inner_dim = ffn_inner_dim
  self.dropout = dropout
  self.attention_dropout = attention_dropout
  self.relu_dropout = relu_dropout
  self.position_encoder = position_encoder
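# A minimal usage sketch (not part of this file): instantiating the encoder with
# the base Transformer configuration from "Attention Is All You Need". The import
# path below assumes the standard OpenNMT-tf package layout; adjust if needed.
from opennmt.encoders.self_attention_encoder import SelfAttentionEncoder

encoder = SelfAttentionEncoder(
    num_layers=6,          # 6 identical self-attention blocks
    num_units=512,         # model (hidden) dimension
    num_heads=8,
    ffn_inner_dim=2048,    # inner dimension of the position-wise feed forward layer
    dropout=0.1,
    attention_dropout=0.1,
    relu_dropout=0.1)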
def __init__(self,
             source_inputter,
             target_inputter,
             num_layers,
             num_units,
             num_heads,
             ffn_inner_dim,
             dropout=0.1,
             attention_dropout=0.1,
             relu_dropout=0.1,
             position_encoder=SinusoidalPositionEncoder(),
             decoder_self_attention_type="scaled_dot",
             name="transformer"):
  """Initializes a Transformer model.

  Args:
    source_inputter: A :class:`opennmt.inputters.inputter.Inputter` to process
      the source data.
    target_inputter: A :class:`opennmt.inputters.inputter.Inputter` to process
      the target data. Currently, only the
      :class:`opennmt.inputters.text_inputter.WordEmbedder` is supported.
    num_layers: The shared number of layers.
    num_units: The number of hidden units.
    num_heads: The number of heads in each self-attention layer.
    ffn_inner_dim: The inner dimension of the feed forward layers.
    dropout: The probability to drop units in each layer output.
    attention_dropout: The probability to drop units from the attention.
    relu_dropout: The probability to drop units from the ReLU activation in
      the feed forward layer.
    position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
      apply on the inputs.
    decoder_self_attention_type: Type of self attention in the decoder,
      "scaled_dot" or "average" (case insensitive).
    name: The name of this model.
  """
  encoder = SelfAttentionEncoder(
      num_layers,
      num_units=num_units,
      num_heads=num_heads,
      ffn_inner_dim=ffn_inner_dim,
      dropout=dropout,
      attention_dropout=attention_dropout,
      relu_dropout=relu_dropout,
      position_encoder=position_encoder)
  decoder = SelfAttentionDecoder(
      num_layers,
      num_units=num_units,
      num_heads=num_heads,
      ffn_inner_dim=ffn_inner_dim,
      dropout=dropout,
      attention_dropout=attention_dropout,
      relu_dropout=relu_dropout,
      position_encoder=position_encoder,
      self_attention_type=decoder_self_attention_type)
  super(Transformer, self).__init__(
      source_inputter,
      target_inputter,
      encoder,
      decoder,
      daisy_chain_variables=True,
      name=name)
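# A hedged sketch of building this Transformer with word embedders on both sides,
# mirroring the docstring above. The import paths, vocabulary keys, and embedding
# sizes are illustrative assumptions, not values required by this constructor.
from opennmt.inputters.text_inputter import WordEmbedder
from opennmt.models.transformer import Transformer

model = Transformer(
    source_inputter=WordEmbedder("source_words_vocabulary", embedding_size=512),
    target_inputter=WordEmbedder("target_words_vocabulary", embedding_size=512),
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048)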
def __init__(self,
             num_layers,
             num_units=512,
             num_heads=8,
             ffn_inner_dim=2048,
             dropout=0.1,
             attention_dropout=0.1,
             ffn_dropout=0.1,
             ffn_activation=tf.nn.relu,
             position_encoder=SinusoidalPositionEncoder(),
             num_sources=1,
             **kwargs):
  """Initializes the parameters of the decoder.

  Args:
    num_layers: The number of layers.
    num_units: The number of hidden units.
    num_heads: The number of heads in the multi-head attention.
    ffn_inner_dim: The number of units of the inner linear transformation
      in the feed forward layer.
    dropout: The probability to drop units from the outputs.
    attention_dropout: The probability to drop units from the attention.
    ffn_dropout: The probability to drop units from the activation output in
      the feed forward layer.
    ffn_activation: The activation function to apply between the two linear
      transformations of the feed forward layer.
    position_encoder: The :class:`opennmt.layers.position.PositionEncoder`
      to apply on inputs.
    num_sources: The number of source contexts expected by this decoder.
    **kwargs: Additional layer arguments.
  """
  super(SelfAttentionDecoderV2, self).__init__(num_sources=num_sources, **kwargs)
  self.num_units = num_units
  self.num_heads = num_heads
  self.dropout = dropout
  self.position_encoder = position_encoder
  self.layer_norm = common.LayerNorm(name="output_norm")
  self.layers = [
      _SelfAttentionDecoderLayer(
          self.num_units,
          self.num_heads,
          ffn_inner_dim,
          num_sources=num_sources,
          dropout=dropout,
          attention_dropout=attention_dropout,
          ffn_dropout=ffn_dropout,
          ffn_activation=ffn_activation,
          name="layer_%d" % i)
      for i in range(num_layers)]
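# A minimal sketch (assumed usage, not from this file): a decoder that expects two
# source contexts, e.g. for a multi-source model, with the default ReLU feed
# forward activation. The import path is an assumption about the module layout.
import tensorflow as tf
from opennmt.decoders.self_attention_decoder import SelfAttentionDecoderV2

decoder = SelfAttentionDecoderV2(
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048,
    ffn_activation=tf.nn.relu,
    num_sources=2)  # each decoder layer will attend over two encoder outputs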
def __init__(self,
             num_layers,
             num_units=512,
             num_heads=8,
             ffn_inner_dim=2048,
             dropout=0.1,
             attention_dropout=0.1,
             relu_dropout=0.1,
             position_encoder=SinusoidalPositionEncoder(),
             self_attention_type="scaled_dot"):
  """Initializes the parameters of the decoder.

  Args:
    num_layers: The number of layers.
    num_units: The number of hidden units.
    num_heads: The number of heads in the multi-head attention.
    ffn_inner_dim: The number of units of the inner linear transformation
      in the feed forward layer.
    dropout: The probability to drop units from the outputs.
    attention_dropout: The probability to drop units from the attention.
    relu_dropout: The probability to drop units from the ReLU activation in
      the feed forward layer.
    position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
      apply on inputs or ``None``.
    self_attention_type: Type of self attention, "scaled_dot" or "average"
      (case insensitive).

  Raises:
    ValueError: if :obj:`self_attention_type` is invalid.
  """
  self.num_layers = num_layers
  self.num_units = num_units
  self.num_heads = num_heads
  self.ffn_inner_dim = ffn_inner_dim
  self.dropout = dropout
  self.attention_dropout = attention_dropout
  self.relu_dropout = relu_dropout
  self.position_encoder = position_encoder
  self.self_attention_type = self_attention_type.lower()
  if self.self_attention_type not in ("scaled_dot", "average"):
    raise ValueError("invalid attention type %s" % self.self_attention_type)
  if self.self_attention_type == "average":
    tf.logging.warning("Support for average attention network is experimental "
                       "and may change in future versions.")
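# A hedged sketch showing the self_attention_type switch documented above. The
# import path is assumed from the OpenNMT-tf v1 layout.
from opennmt.decoders.self_attention_decoder import SelfAttentionDecoder

decoder = SelfAttentionDecoder(
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048,
    self_attention_type="average")  # logs the experimental-support warning
# Any other value, e.g. "additive", raises ValueError per the check above.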
def __init__(self,
             num_layers,
             num_units=512,
             num_heads=8,
             ffn_inner_dim=2048,
             dropout=0.1,
             attention_dropout=0.1,
             relu_dropout=0.1,
             position_encoder=None,
             **kwargs):
  """Initializes the parameters of the encoder.

  Args:
    num_layers: The number of layers.
    num_units: The number of hidden units.
    num_heads: The number of heads in the multi-head attention.
    ffn_inner_dim: The number of units of the inner linear transformation
      in the feed forward layer.
    dropout: The probability to drop units from the outputs.
    attention_dropout: The probability to drop units from the attention.
    relu_dropout: The probability to drop units from the ReLU activation in
      the feed forward layer.
    position_encoder: The :class:`opennmt.layers.position.PositionEncoder`
      to apply on inputs. If ``None``, defaults to
      :class:`opennmt.layers.position.SinusoidalPositionEncoder`.
  """
  super(SelfAttentionEncoderV2, self).__init__(**kwargs)
  self.num_units = num_units
  self.dropout = dropout
  self.position_encoder = position_encoder
  if self.position_encoder is None:
    self.position_encoder = SinusoidalPositionEncoder()
  self.layer_norm = common.LayerNorm()
  self.layers = [
      _SelfAttentionEncoderLayer(
          num_units,
          num_heads,
          ffn_inner_dim,
          dropout=dropout,
          attention_dropout=attention_dropout,
          relu_dropout=relu_dropout,
          name="layer_%d" % i)
      for i in range(num_layers)]
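# A minimal sketch (assumed usage): leaving position_encoder as None so the
# constructor above falls back to the sinusoidal default. The import path is an
# assumption about the module layout.
from opennmt.encoders.self_attention_encoder import SelfAttentionEncoderV2

encoder = SelfAttentionEncoderV2(num_layers=6)
assert encoder.position_encoder is not None  # replaced by SinusoidalPositionEncoder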
def __init__(self,
             source_inputter,
             target_inputter,
             num_layers,
             num_units,
             num_heads,
             ffn_inner_dim,
             dropout=0.1,
             attention_dropout=0.1,
             relu_dropout=0.1,
             position_encoder=SinusoidalPositionEncoder(),
             decoder_self_attention_type="scaled_dot",
             share_embeddings=EmbeddingsSharingLevel.NONE,
             share_encoders=False,
             alignment_file_key="train_alignments",
             name="transformer"):
  """Initializes a Transformer model.

  Args:
    source_inputter: A :class:`opennmt.inputters.inputter.Inputter` to process
      the source data. If this inputter returns parallel inputs, a
      multi-source Transformer architecture will be constructed.
    target_inputter: A :class:`opennmt.inputters.inputter.Inputter` to process
      the target data. Currently, only the
      :class:`opennmt.inputters.text_inputter.WordEmbedder` is supported.
    num_layers: The shared number of layers.
    num_units: The number of hidden units.
    num_heads: The number of heads in each self-attention layer.
    ffn_inner_dim: The inner dimension of the feed forward layers.
    dropout: The probability to drop units in each layer output.
    attention_dropout: The probability to drop units from the attention.
    relu_dropout: The probability to drop units from the ReLU activation in
      the feed forward layer.
    position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
      apply on the inputs.
    decoder_self_attention_type: Type of self attention in the decoder,
      "scaled_dot" or "average" (case insensitive).
    share_embeddings: Level of embeddings sharing, see
      :class:`opennmt.models.sequence_to_sequence.EmbeddingsSharingLevel`
      for possible values.
    share_encoders: In the case of a multi-source architecture, whether to
      share the parameters of the separate encoders or not.
    alignment_file_key: The data configuration key of the training alignment
      file to support guided alignment.
    name: The name of this model.
  """
  encoders = [
      SelfAttentionEncoder(
          num_layers,
          num_units=num_units,
          num_heads=num_heads,
          ffn_inner_dim=ffn_inner_dim,
          dropout=dropout,
          attention_dropout=attention_dropout,
          relu_dropout=relu_dropout,
          position_encoder=position_encoder)
      for _ in range(source_inputter.num_outputs)]
  if len(encoders) > 1:
    encoder = ParallelEncoder(
        encoders,
        outputs_reducer=None,
        states_reducer=None,
        share_parameters=share_encoders)
  else:
    encoder = encoders[0]
  decoder = SelfAttentionDecoder(
      num_layers,
      num_units=num_units,
      num_heads=num_heads,
      ffn_inner_dim=ffn_inner_dim,
      dropout=dropout,
      attention_dropout=attention_dropout,
      relu_dropout=relu_dropout,
      position_encoder=position_encoder,
      self_attention_type=decoder_self_attention_type)
  self._num_units = num_units
  super(Transformer, self).__init__(
      source_inputter,
      target_inputter,
      encoder,
      decoder,
      share_embeddings=share_embeddings,
      alignment_file_key=alignment_file_key,
      daisy_chain_variables=True,
      name=name)
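# A hedged sketch of a two-source Transformer: a parallel source inputter that
# returns two inputs makes the constructor above build one encoder per source and
# wrap them in a ParallelEncoder, optionally sharing their parameters. The import
# paths, vocabulary keys, and embedding sizes are illustrative assumptions.
from opennmt.inputters.inputter import ParallelInputter
from opennmt.inputters.text_inputter import WordEmbedder
from opennmt.models.transformer import Transformer

source_inputter = ParallelInputter([
    WordEmbedder("source_words_vocabulary_1", embedding_size=512),
    WordEmbedder("source_words_vocabulary_2", embedding_size=512)])
model = Transformer(
    source_inputter,
    WordEmbedder("target_words_vocabulary", embedding_size=512),
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048,
    share_encoders=True)  # the two source encoders reuse the same parameters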