# Decoder layer: two multi-head attention blocks and a point-wise feed-forward
# network, each paired with its own dropout and layer normalization.
def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
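# A minimal call() sketch for the layer above, following the standard Transformer
# decoder pattern: mha1 as masked self-attention, mha2 as encoder-decoder attention.
# It assumes MultiHeadAttention returns (output, attention_weights) and takes
# (values, keys, queries, mask); the mask argument names are illustrative.
def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)   # masked self-attention
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)                                  # residual + layer norm

    attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)  # cross-attention
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)

    ffn_output = self.ffn(out2)                                        # position-wise feed-forward
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)

    return out3, attn_weights_block1, attn_weights_block2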
# Encoder layer: multi-head self-attention followed by a position-wise
# feed-forward network; the identity_map_reordering flag and any custom
# attention module are forwarded to the sub-layers.
def __init__(self, d_model=512, d_k=64, d_v=64, h=8, d_ff=2048, dropout=.1,
             identity_map_reordering=False, attention_module=None,
             attention_module_kwargs=None):
    super(EncoderLayer, self).__init__()
    self.identity_map_reordering = identity_map_reordering
    self.mhatt = MultiHeadAttention(d_model, d_k, d_v, h, dropout,
                                    identity_map_reordering=identity_map_reordering,
                                    attention_module=attention_module,
                                    attention_module_kwargs=attention_module_kwargs)
    self.pwff = PositionWiseFeedForward(d_model, d_ff, dropout,
                                        identity_map_reordering=identity_map_reordering)
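# A plausible forward() sketch for the encoder layer above: self-attention over
# the input, then the feed-forward block. The (queries, keys, values, mask,
# weights) call signature of MultiHeadAttention is an assumption.
def forward(self, queries, keys, values, attention_mask=None, attention_weights=None):
    att = self.mhatt(queries, keys, values, attention_mask, attention_weights)
    ff = self.pwff(att)
    return ff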
# Meshed decoder layer: stateful self-attention, cross-attention over the encoder
# memory, a position-wise feed-forward network, and three linear layers
# (fc_alpha1-3) that each map a 2*d_model concatenation back to d_model.
def __init__(self, d_model=512, d_k=64, d_v=64, h=8, d_ff=2048, dropout=.1,
             self_att_module=None, enc_att_module=None,
             self_att_module_kwargs=None, enc_att_module_kwargs=None):
    super(MeshedDecoderLayer, self).__init__()
    self.self_att = MultiHeadAttention(d_model, d_k, d_v, h, dropout,
                                       can_be_stateful=True,
                                       attention_module=self_att_module,
                                       attention_module_kwargs=self_att_module_kwargs)
    self.enc_att = MultiHeadAttention(d_model, d_k, d_v, h, dropout,
                                      can_be_stateful=False,
                                      attention_module=enc_att_module,
                                      attention_module_kwargs=enc_att_module_kwargs)
    self.pwff = PositionWiseFeedForward(d_model, d_ff, dropout)

    self.fc_alpha1 = nn.Linear(d_model + d_model, d_model)
    self.fc_alpha2 = nn.Linear(d_model + d_model, d_model)
    self.fc_alpha3 = nn.Linear(d_model + d_model, d_model)

    self.init_weights()
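# Sketch of how the fc_alpha gates could be used, following the meshed-memory
# decoder idea suggested by the class name: enc_output is assumed to stack one
# memory per encoder level along dim 1, the attention call signature is assumed
# to be (queries, keys, values, mask), and padding handling is simplified.
def forward(self, input, enc_output, mask_self_att=None, mask_enc_att=None):
    self_att = self.self_att(input, input, input, mask_self_att)

    # Cross-attend to each encoder level separately.
    enc_att1 = self.enc_att(self_att, enc_output[:, 0], enc_output[:, 0], mask_enc_att)
    enc_att2 = self.enc_att(self_att, enc_output[:, 1], enc_output[:, 1], mask_enc_att)
    enc_att3 = self.enc_att(self_att, enc_output[:, 2], enc_output[:, 2], mask_enc_att)

    # Gate each level's contribution with a sigmoid over [self_att; enc_att_i].
    alpha1 = torch.sigmoid(self.fc_alpha1(torch.cat([self_att, enc_att1], dim=-1)))
    alpha2 = torch.sigmoid(self.fc_alpha2(torch.cat([self_att, enc_att2], dim=-1)))
    alpha3 = torch.sigmoid(self.fc_alpha3(torch.cat([self_att, enc_att3], dim=-1)))

    enc_att = (enc_att1 * alpha1 + enc_att2 * alpha2 + enc_att3 * alpha3) / (3 ** 0.5)
    return self.pwff(enc_att)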
# Attention module with linear projections between the hidden space (hid_dim)
# and the embedding space (emb_dim); attention energies can optionally be scaled
# by sqrt(emb_dim), and a multi-head branch is built when multi_head=True.
def __init__(self, hid_dim, emb_dim, score_fn=F.softmax, scaling_energy=False,
             multi_head=False, d_keys_values=64, dropout=0.):
    super().__init__()
    self.score_fn = score_fn
    self.scale = torch.sqrt(torch.FloatTensor([0.5]))
    self.multi_head = multi_head
    self.scaling_energy = torch.sqrt(torch.Tensor([emb_dim])) if scaling_energy else 1

    self.attention_hid2emb = nn.Linear(hid_dim, emb_dim)
    self.attention_emb2hid = nn.Linear(emb_dim, hid_dim)
    self.p_attention_emb2hid = nn.Linear(emb_dim, hid_dim)

    if multi_head:
        self.multi_head_att = MultiHeadAttention(emb_dim, d_keys_values, d_keys_values,
                                                 emb_dim // d_keys_values, dropout=dropout)
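# Instantiation sketch: the enclosing class name is not visible above, so
# "AdaptiveAttention" is a placeholder and the dimensions are hypothetical.
# Note that the head count of the optional multi-head branch is derived from the
# embedding size, so emb_dim must be divisible by d_keys_values
# (e.g. 512 // 64 = 8 heads).
att_layer = AdaptiveAttention(hid_dim=512, emb_dim=512, scaling_energy=True,
                              multi_head=True, d_keys_values=64, dropout=0.1)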
# Final decoder layer: multi-head self-attention plus pointer-style multi-head
# attention and a pointer attention head, with a single dropout and layer norm.
def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(LastDecoderLayer, self).__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = PointerMultiHeadAttention(d_model, num_heads)
    self.pointer_attention = PointerAttention()

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = tf.keras.layers.Dropout(rate)
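# Instantiation sketch with illustrative hyperparameters; dff appears in the
# signature, matching the other decoder layers, but is not used by this
# constructor.
last_layer = LastDecoderLayer(d_model=512, num_heads=8, dff=2048, rate=0.1)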
# Decoder layer: stateful self-attention, encoder-decoder attention, and a
# position-wise feed-forward network; identity_map_reordering and the optional
# attention modules are forwarded to every sub-layer.
def __init__(self, d_model=512, d_k=64, d_v=64, h=8, d_ff=2048, dropout=.1,
             identity_map_reordering=False, self_att_module=None, enc_att_module=None,
             self_att_module_kwargs=None, enc_att_module_kwargs=None):
    super(DecoderLayer, self).__init__()
    self.self_att = MultiHeadAttention(d_model, d_k, d_v, h, dropout,
                                       can_be_stateful=True,
                                       identity_map_reordering=identity_map_reordering,
                                       attention_module=self_att_module,
                                       attention_module_kwargs=self_att_module_kwargs)
    self.enc_att = MultiHeadAttention(d_model, d_k, d_v, h, dropout,
                                      can_be_stateful=False,
                                      identity_map_reordering=identity_map_reordering,
                                      attention_module=enc_att_module,
                                      attention_module_kwargs=enc_att_module_kwargs)
    self.pwff = PositionWiseFeedForward(d_model, d_ff, dropout,
                                        identity_map_reordering=identity_map_reordering)
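# A plausible forward() sketch for the layer above: masked self-attention over
# the decoder input, cross-attention into the encoder output, then the
# feed-forward block. Mask arguments and padding handling are simplified
# assumptions, as is the (queries, keys, values, mask) attention signature.
def forward(self, input, enc_output, mask_self_att=None, mask_enc_att=None):
    self_att = self.self_att(input, input, input, mask_self_att)
    enc_att = self.enc_att(self_att, enc_output, enc_output, mask_enc_att)
    return self.pwff(enc_att)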