def __init__(self,
             num_layers,
             num_units=512,
             num_heads=8,
             ffn_inner_dim=2048,
             dropout=0.1,
             attention_dropout=0.1,
             ffn_dropout=0.1,
             ffn_activation=tf.nn.relu,
             position_encoder_class=SinusoidalPositionEncoder,
             num_sources=1,
             maximum_relative_position=None,
             **kwargs):
  """Initializes the parameters of the decoder.

  Args:
    num_layers: The number of layers.
    num_units: The number of hidden units.
    num_heads: The number of heads in the multi-head attention.
    ffn_inner_dim: The number of units of the inner linear transformation
      in the feed forward layer.
    dropout: The probability to drop units from the outputs.
    attention_dropout: The probability to drop units from the attention.
    ffn_dropout: The probability to drop units from the activation output in
      the feed forward layer.
    ffn_activation: The activation function to apply between the two linear
      transformations of the feed forward layer.
    position_encoder_class: The :class:`opennmt.layers.PositionEncoder`
      class to use for position encoding (or a callable that returns an
      instance).
    num_sources: The number of source contexts expected by this decoder.
    maximum_relative_position: Maximum relative position representation
      (from https://arxiv.org/abs/1803.02155).
    **kwargs: Additional layer arguments.
  """
  super(SelfAttentionDecoder, self).__init__(num_sources=num_sources, **kwargs)
  self.num_units = num_units
  self.num_heads = num_heads
  self.dropout = dropout
  self.position_encoder = None
  if position_encoder_class is not None:
    self.position_encoder = position_encoder_class()
  self.layer_norm = common.LayerNorm()
  self.layers = [
      transformer.SelfAttentionDecoderLayer(
          self.num_units,
          self.num_heads,
          ffn_inner_dim,
          num_sources=num_sources,
          dropout=dropout,
          attention_dropout=attention_dropout,
          ffn_dropout=ffn_dropout,
          ffn_activation=ffn_activation,
          maximum_relative_position=maximum_relative_position)
      for i in range(num_layers)
  ]
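# --- Usage sketch (not part of the original class) ---
# A minimal, hedged example of constructing the decoder above. It assumes the
# surrounding module's imports (tf, common, transformer,
# SinusoidalPositionEncoder) are in scope and only uses constructor arguments
# documented in the docstring; the hyperparameter values are illustrative.
decoder = SelfAttentionDecoder(
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048,
    num_sources=1,
    maximum_relative_position=None)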
def __init__(self,
             num_layers,
             num_units=768,
             num_heads=12,
             ffn_inner_dim=3072,
             dropout=0.1,
             attention_dropout=0.1,
             ffn_dropout=0.1,
             ffn_activation=tf.nn.relu,
             embedding_table=None,
             position_encoder=SinusoidalPositionEncoder(),
             num_sources=1,
             **kwargs):
  """Initializes the parameters of the decoder.

  Args:
    num_layers: The number of layers.
    num_units: The number of hidden units.
    num_heads: The number of heads in the multi-head attention.
    ffn_inner_dim: The number of units of the inner linear transformation
      in the feed forward layer.
    dropout: The probability to drop units from the outputs.
    attention_dropout: The probability to drop units from the attention.
    ffn_dropout: The probability to drop units from the activation output in
      the feed forward layer.
    ffn_activation: The activation function to apply between the two linear
      transformations of the feed forward layer.
    embedding_table: An optional embedding table stored on the decoder
      (e.g. to share weights with the output layer).
    position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
      apply on inputs.
    num_sources: The number of source contexts expected by this decoder.
    **kwargs: Additional layer arguments.
  """
  super(SelfAttentionDecoderV2, self).__init__(num_sources=num_sources, **kwargs)
  self.num_units = num_units
  self.num_heads = num_heads
  self.dropout = dropout
  self.embedding_table = embedding_table
  self.position_encoder = position_encoder
  self.layer_norm = common.LayerNorm(name="output_norm")
  self.layers = [
      _SelfAttentionDecoderLayer(
          self.num_units,
          self.num_heads,
          ffn_inner_dim,
          num_sources=num_sources,
          dropout=dropout,
          attention_dropout=attention_dropout,
          ffn_dropout=ffn_dropout,
          ffn_activation=ffn_activation,
          name="layer_%d" % i)
      for i in range(num_layers)
  ]
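# --- Usage sketch (not part of the original class) ---
# Hedged example of constructing SelfAttentionDecoderV2 above; it assumes the
# module's private _SelfAttentionDecoderLayer and the other imports are in
# scope. The values shown simply echo the constructor defaults.
decoder = SelfAttentionDecoderV2(
    num_layers=12,
    num_units=768,
    num_heads=12,
    ffn_inner_dim=3072,
    num_sources=1)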
def __init__(self,
             num_layers,
             num_units=512,
             num_heads=8,
             ffn_inner_dim=2048,
             dropout=0.1,
             attention_dropout=0.1,
             ffn_dropout=0.1,
             ffn_activation=tf.nn.relu,
             position_encoder_class=SinusoidalPositionEncoder,
             **kwargs):
  """Initializes the parameters of the encoder.

  Args:
    num_layers: The number of layers.
    num_units: The number of hidden units.
    num_heads: The number of heads in the multi-head attention.
    ffn_inner_dim: The number of units of the inner linear transformation
      in the feed forward layer.
    dropout: The probability to drop units from the outputs.
    attention_dropout: The probability to drop units from the attention.
    ffn_dropout: The probability to drop units from the activation output in
      the feed forward layer.
    ffn_activation: The activation function to apply between the two linear
      transformations of the feed forward layer.
    position_encoder_class: The :class:`opennmt.layers.PositionEncoder`
      class to use for position encoding (or a callable that returns an
      instance).
    **kwargs: Additional layer arguments.
  """
  super(SelfAttentionEncoder, self).__init__(**kwargs)
  self.num_units = num_units
  self.dropout = dropout
  self.position_encoder = None
  if position_encoder_class is not None:
    self.position_encoder = position_encoder_class()
  self.layer_norm = common.LayerNorm()
  self.layers = [
      transformer.SelfAttentionEncoderLayer(
          num_units,
          num_heads,
          ffn_inner_dim,
          dropout=dropout,
          attention_dropout=attention_dropout,
          ffn_dropout=ffn_dropout,
          ffn_activation=ffn_activation)
      for i in range(num_layers)
  ]
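# --- Usage sketch (not part of the original class) ---
# Hedged example of constructing the encoder above, assuming the module's
# imports are in scope. The commented forward pass reflects the assumed
# OpenNMT-tf encoder interface (inputs, sequence_length, training) returning
# (outputs, state, sequence_length); it is an assumption, not defined here.
encoder = SelfAttentionEncoder(
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048)
# outputs, state, outputs_length = encoder(inputs, sequence_length=lengths, training=True)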
def __init__(self,
             num_layers,
             num_units=512,
             num_heads=8,
             ffn_inner_dim=2048,
             dropout=0.1,
             attention_dropout=0.1,
             ffn_dropout=0.1,
             ffn_activation=tf.nn.relu,
             position_encoder=SinusoidalPositionEncoder(),
             max_relative_positions=0,
             **kwargs):
  """Initializes the parameters of the encoder.

  Args:
    num_layers: The number of layers.
    num_units: The number of hidden units.
    num_heads: The number of heads in the multi-head attention.
    ffn_inner_dim: The number of units of the inner linear transformation
      in the feed forward layer.
    dropout: The probability to drop units from the outputs.
    attention_dropout: The probability to drop units from the attention.
    ffn_dropout: The probability to drop units from the activation output in
      the feed forward layer.
    ffn_activation: The activation function to apply between the two linear
      transformations of the feed forward layer.
    position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
      apply on inputs.
    max_relative_positions: Maximum relative position representation
      (from https://arxiv.org/abs/1803.02155).
    **kwargs: Additional layer arguments.
  """
  super(SelfAttentionEncoderV2, self).__init__(**kwargs)
  self.num_units = num_units
  self.dropout = dropout
  self.position_encoder = position_encoder
  self.layer_norm = common.LayerNorm()
  self.layers = [
      _SelfAttentionEncoderLayer(
          num_units,
          num_heads,
          ffn_inner_dim,
          dropout=dropout,
          attention_dropout=attention_dropout,
          ffn_dropout=ffn_dropout,
          ffn_activation=ffn_activation,
          name="layer_%d" % i)
      for i in range(num_layers)
  ]
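# --- Usage sketch (not part of the original class) ---
# Hedged example of constructing SelfAttentionEncoderV2 above, assuming the
# module's private _SelfAttentionEncoderLayer and other imports are in scope.
# Note that in this snippet max_relative_positions is accepted but not
# forwarded to the layers, so it is left at its default of 0 here.
encoder = SelfAttentionEncoderV2(
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048)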
def __init__(self,
             num_layers,
             num_units=512,
             num_heads=8,
             ffn_inner_dim=2048,
             dropout=0.1,
             attention_dropout=0.1,
             relu_dropout=0.1,
             position_encoder=None,
             **kwargs):
  """Initializes the parameters of the encoder.

  Args:
    num_layers: The number of layers.
    num_units: The number of hidden units.
    num_heads: The number of heads in the multi-head attention.
    ffn_inner_dim: The number of units of the inner linear transformation
      in the feed forward layer.
    dropout: The probability to drop units from the outputs.
    attention_dropout: The probability to drop units from the attention.
    relu_dropout: The probability to drop units from the ReLU activation in
      the feed forward layer.
    position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
      apply on inputs. If ``None``, defaults to
      :class:`opennmt.layers.position.SinusoidalPositionEncoder`.
    **kwargs: Additional layer arguments.
  """
  super(SelfAttentionEncoderV2, self).__init__(**kwargs)
  self.num_units = num_units
  self.dropout = dropout
  self.position_encoder = position_encoder
  if self.position_encoder is None:
    self.position_encoder = SinusoidalPositionEncoder()
  self.layer_norm = common.LayerNorm()
  self.layers = [
      _SelfAttentionEncoderLayer(
          num_units,
          num_heads,
          ffn_inner_dim,
          dropout=dropout,
          attention_dropout=attention_dropout,
          relu_dropout=relu_dropout,
          name="layer_%d" % i)
      for i in range(num_layers)
  ]
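# --- Usage sketch (not part of the original class) ---
# Hedged example of constructing the relu_dropout variant above, assuming the
# module's imports are in scope. Leaving position_encoder=None selects the
# SinusoidalPositionEncoder fallback implemented in the constructor.
encoder = SelfAttentionEncoderV2(
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048,
    relu_dropout=0.1,
    position_encoder=None)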
def testLayerNorm(self):
  layer_norm = common.LayerNorm()
  x = tf.random.uniform([4, 10])
  y = layer_norm(x)
  self.assertEqual(y.shape, x.shape)
def __init__(self,
             num_layers,
             num_units=512,
             num_heads=8,
             ffn_inner_dim=2048,
             dropout=0.1,
             attention_dropout=0.1,
             ffn_dropout=0.1,
             ffn_activation=tf.nn.relu,
             position_encoder_class=SinusoidalPositionEncoder,
             maximum_relative_position=None,
             attention_span=None,
             num_attended_heads=1,
             **kwargs):
  """Initializes the parameters of the encoder.

  Args:
    num_layers: The number of layers.
    num_units: The number of hidden units.
    num_heads: The number of heads in the multi-head attention.
    ffn_inner_dim: The number of units of the inner linear transformation
      in the feed forward layer.
    dropout: The probability to drop units from the outputs.
    attention_dropout: The probability to drop units from the attention.
    ffn_dropout: The probability to drop units from the activation output in
      the feed forward layer.
    ffn_activation: The activation function to apply between the two linear
      transformations of the feed forward layer.
    position_encoder_class: The :class:`opennmt.layers.PositionEncoder`
      class to use for position encoding (or a callable that returns an
      instance).
    maximum_relative_position: Maximum relative position representation
      (from https://arxiv.org/abs/1803.02155).
    attention_span: Maximum relative position to attend to
      (from https://arxiv.org/abs/1904.03107).
    num_attended_heads: The number of heads each head attends to. Defaults to
      1, as each head only attends to itself in the vanilla Transformer.
      Increase to an odd number < ``num_heads`` to also model head
      interaction (from https://arxiv.org/abs/1904.03107).
    **kwargs: Additional layer arguments.
  """
  super(SelfAttentionEncoder, self).__init__(**kwargs)
  self.num_units = num_units
  self.dropout = dropout
  self.position_encoder = None
  if position_encoder_class is not None:
    self.position_encoder = position_encoder_class()
  self.layer_norm = common.LayerNorm()
  if attention_span is None:
    num_unconstrained_layers = num_layers
  else:
    num_unconstrained_layers = math.floor(num_layers / 2)
  num_constrained_layers = num_layers - num_unconstrained_layers
  self.layers = [
      transformer.SelfAttentionEncoderLayer(
          num_units,
          num_heads,
          ffn_inner_dim,
          dropout=dropout,
          attention_dropout=attention_dropout,
          ffn_dropout=ffn_dropout,
          ffn_activation=ffn_activation,
          maximum_relative_position=maximum_relative_position,
          attention_span=attention_span,
          num_attended_heads=num_attended_heads)
      for _ in range(num_constrained_layers)]
  self.layers += [
      transformer.SelfAttentionEncoderLayer(
          num_units,
          num_heads,
          ffn_inner_dim,
          dropout=dropout,
          attention_dropout=attention_dropout,
          ffn_dropout=ffn_dropout,
          ffn_activation=ffn_activation,
          maximum_relative_position=maximum_relative_position)
      for _ in range(num_unconstrained_layers)]
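# --- Usage sketch (not part of the original class) ---
# Hedged example of constructing the constrained-attention variant above,
# assuming the module's imports (including math) are in scope. With
# attention_span set, the constructor builds span-constrained layers for the
# first num_layers - floor(num_layers / 2) entries of self.layers and regular
# self-attention layers for the rest; the values below are illustrative.
encoder = SelfAttentionEncoder(
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048,
    attention_span=10,
    num_attended_heads=3)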