def __init__(
    self,
    num_layers,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048,
    dropout=0.1,
    attention_dropout=0.1,
    ffn_dropout=0.1,
    ffn_activation=tf.nn.relu,
    position_encoder_class=SinusoidalPositionEncoder,
    maximum_relative_position=None,
    pre_norm=True,
    **kwargs
):
    """Initializes the parameters of the encoder.

    Args:
      num_layers: The number of layers.
      num_units: The number of hidden units.
      num_heads: The number of heads in the multi-head attention.
      ffn_inner_dim: The number of units of the inner linear transformation
        in the feed forward layer.
      dropout: The probability to drop units from the outputs.
      attention_dropout: The probability to drop units from the attention.
      ffn_dropout: The probability to drop units from the activation output
        in the feed forward layer.
      ffn_activation: The activation function to apply between the two linear
        transformations of the feed forward layer.
      position_encoder_class: The :class:`opennmt.layers.PositionEncoder`
        class to use for position encoding (or a callable that returns an
        instance).
      maximum_relative_position: Maximum relative position representation
        (from https://arxiv.org/abs/1803.02155).
      pre_norm: If ``True``, layer normalization is applied before each
        sub-layer. Otherwise it is applied after.
      **kwargs: Additional layer arguments.
    """
    super().__init__(**kwargs)
    self.num_units = num_units
    self.dropout = dropout
    self.position_encoder = None
    if position_encoder_class is not None:
        self.position_encoder = position_encoder_class()
    # A final layer norm is only needed in the pre-norm architecture.
    self.layer_norm = common.LayerNorm() if pre_norm else None
    self.layers = [
        transformer.SelfAttentionEncoderLayer(
            num_units,
            num_heads,
            ffn_inner_dim,
            dropout=dropout,
            attention_dropout=attention_dropout,
            ffn_dropout=ffn_dropout,
            ffn_activation=ffn_activation,
            maximum_relative_position=maximum_relative_position,
            pre_norm=pre_norm,
        )
        for _ in range(num_layers)
    ]
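# --- Usage sketch (illustrative, not part of the encoder source) ------------
# A minimal example of building and running this encoder, assuming OpenNMT-tf
# is installed and the class is exposed as ``opennmt.encoders.SelfAttentionEncoder``
# (as in OpenNMT-tf 2.x). Shapes and values below are arbitrary placeholders.
import tensorflow as tf
import opennmt

encoder = opennmt.encoders.SelfAttentionEncoder(
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048,
    pre_norm=True,
)
inputs = tf.random.uniform([2, 10, 512])  # [batch, time, depth=num_units]
lengths = tf.constant([10, 7])            # true length of each sequence
outputs, _, out_lengths = encoder(inputs, sequence_length=lengths, training=False)
# outputs has shape [2, 10, 512]; padded positions are masked in attention.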
def __init__(
    self,
    num_layers,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048,
    dropout=0.1,
    attention_dropout=0.1,
    ffn_dropout=0.1,
    ffn_activation=tf.nn.relu,
    position_encoder_class=SinusoidalPositionEncoder,
    **kwargs
):
    """Initializes the parameters of the encoder.

    Args:
      num_layers: The number of layers.
      num_units: The number of hidden units.
      num_heads: The number of heads in the multi-head attention.
      ffn_inner_dim: The number of units of the inner linear transformation
        in the feed forward layer.
      dropout: The probability to drop units from the outputs.
      attention_dropout: The probability to drop units from the attention.
      ffn_dropout: The probability to drop units from the activation output
        in the feed forward layer.
      ffn_activation: The activation function to apply between the two linear
        transformations of the feed forward layer.
      position_encoder_class: The :class:`opennmt.layers.PositionEncoder`
        class to use for position encoding (or a callable that returns an
        instance).
    """
    super(SelfAttentionEncoder, self).__init__(**kwargs)
    self.num_units = num_units
    self.dropout = dropout
    self.position_encoder = None
    if position_encoder_class is not None:
        self.position_encoder = position_encoder_class()
    self.layer_norm = common.LayerNorm()
    self.layers = [
        transformer.SelfAttentionEncoderLayer(
            num_units,
            num_heads,
            ffn_inner_dim,
            dropout=dropout,
            attention_dropout=attention_dropout,
            ffn_dropout=ffn_dropout,
            ffn_activation=ffn_activation,
        )
        for _ in range(num_layers)
    ]
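# --- Usage sketch for this variant (illustrative) ----------------------------
# ``position_encoder_class`` may be set to ``None`` to skip the sinusoidal
# position encoding entirely, or to any callable returning a
# ``PositionEncoder`` instance. A hedged sketch, assuming the same
# ``opennmt.encoders.SelfAttentionEncoder`` entry point as above:
import opennmt

encoder_no_positions = opennmt.encoders.SelfAttentionEncoder(
    num_layers=4,
    position_encoder_class=None,  # no position signal is added to the inputs
)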
def __init__(
    self,
    num_layers,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048,
    dropout=0.1,
    attention_dropout=0.1,
    ffn_dropout=0.1,
    ffn_activation=tf.nn.relu,
    position_encoder_class=SinusoidalPositionEncoder,
    maximum_relative_position=None,
    attention_span=None,
    num_attended_heads=1,
    **kwargs
):
    """Initializes the parameters of the encoder.

    Args:
      num_layers: The number of layers.
      num_units: The number of hidden units.
      num_heads: The number of heads in the multi-head attention.
      ffn_inner_dim: The number of units of the inner linear transformation
        in the feed forward layer.
      dropout: The probability to drop units from the outputs.
      attention_dropout: The probability to drop units from the attention.
      ffn_dropout: The probability to drop units from the activation output
        in the feed forward layer.
      ffn_activation: The activation function to apply between the two linear
        transformations of the feed forward layer.
      position_encoder_class: The :class:`opennmt.layers.PositionEncoder`
        class to use for position encoding (or a callable that returns an
        instance).
      maximum_relative_position: Maximum relative position representation
        (from https://arxiv.org/abs/1803.02155).
      attention_span: Maximum relative position to attend to
        (from https://arxiv.org/abs/1904.03107).
      num_attended_heads: How many heads should be attended. Defaults to 1,
        as each head only attends to itself in the vanilla Transformer.
        Increase to an odd number < ``num_heads`` to also model head
        interaction (from https://arxiv.org/abs/1904.03107).
      **kwargs: Additional layer arguments.
    """
    super(SelfAttentionEncoder, self).__init__(**kwargs)
    self.num_units = num_units
    self.dropout = dropout
    self.position_encoder = None
    if position_encoder_class is not None:
        self.position_encoder = position_encoder_class()
    self.layer_norm = common.LayerNorm()
    # Split the stack: the lower layers use span-constrained attention and
    # the upper layers attend without the span constraint. The count of
    # constrained layers is computed outside the conditional so it is always
    # defined (it is 0 when no attention span is set).
    if attention_span is None:
        num_unconstrained_layers = num_layers
    else:
        num_unconstrained_layers = num_layers // 2
    num_constrained_layers = num_layers - num_unconstrained_layers
    self.layers = [
        transformer.SelfAttentionEncoderLayer(
            num_units,
            num_heads,
            ffn_inner_dim,
            dropout=dropout,
            attention_dropout=attention_dropout,
            ffn_dropout=ffn_dropout,
            ffn_activation=ffn_activation,
            maximum_relative_position=maximum_relative_position,
            attention_span=attention_span,
            num_attended_heads=num_attended_heads,
        )
        for _ in range(num_constrained_layers)
    ]
    self.layers += [
        transformer.SelfAttentionEncoderLayer(
            num_units,
            num_heads,
            ffn_inner_dim,
            dropout=dropout,
            attention_dropout=attention_dropout,
            ffn_dropout=ffn_dropout,
            ffn_activation=ffn_activation,
            maximum_relative_position=maximum_relative_position,
        )
        for _ in range(num_unconstrained_layers)
    ]
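# --- Layer-split behavior (illustrative sketch, not library code) ------------
# When ``attention_span`` is set, the lower half of the stack (rounded up)
# is span-constrained and the upper half is unconstrained, following
# https://arxiv.org/abs/1904.03107. The arithmetic for a few stack depths:
for depth in (4, 5, 6):
    unconstrained = depth // 2
    constrained = depth - unconstrained
    print(f"{depth} layers -> {constrained} constrained + {unconstrained} unconstrained")
# 4 layers -> 2 constrained + 2 unconstrained
# 5 layers -> 3 constrained + 2 unconstrained
# 6 layers -> 3 constrained + 3 unconstrained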