def __init__(self, d_model, d_sent, d_con=512, heads=8, d_ff=2048, dropout=0.1, att_drop=0.1, activation="relu", dual_enc=True):
    super(DoubleAttnTransformerDecoderLayer, self).__init__()
    # Decoder self-attention over the target sequence.
    self.self_attn = MultiHeadedAttention(heads, d_model, query_dim=d_model, dropout=att_drop, use_structure=False)
    self.dual_enc = dual_enc
    self.d_sent = d_sent
    self.d_con = d_con
    # Cross-attention over the sentence (token-level) encoder states.
    self.sent_cross_attn = MultiHeadedAttention(
        heads, d_sent, query_dim=d_model, dropout=att_drop, use_structure=False
    )
    # With a single encoder whose width differs from d_model, project its states to d_model.
    if self.d_sent != d_model and not dual_enc:
        self.kv_map = nn.Linear(self.d_sent, d_model)
    else:
        self.kv_map = None
    # 8 graph-attention heads for the default d_con of 512, otherwise 4.
    n_graph_head = 4 if self.d_con != 512 else 8
    if dual_enc:
        # Second cross-attention over the graph/concept encoder states, plus a linear
        # layer that fuses both cross-attention contexts back to d_model.
        self.graph_cross_attn = MultiHeadedAttention(
            n_graph_head, self.d_con, query_dim=d_model, dropout=att_drop, use_structure=False
        )
        self.fuse_linear = nn.Linear(self.d_sent + self.d_con, d_model)
    # Implementation of Feedforward model
    self.linear1 = nn.Linear(d_model, d_ff)
    self.dropout = nn.Dropout(dropout)
    self.linear2 = nn.Linear(d_ff, d_model)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.norm3 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.dropout3 = nn.Dropout(dropout)
    self.activation = _get_activation_fn(activation)
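# Illustrative instantiation sketch (the values below are assumptions, not from the
# original source): a 512-dim decoder attending to a 512-dim sentence encoder and a
# 256-dim graph/concept encoder, so the layer builds graph_cross_attn with 4 heads
# and a fuse_linear projecting the concatenated 768-dim context back to 512.
layer = DoubleAttnTransformerDecoderLayer(
    d_model=512,    # decoder hidden size (queries of every attention block)
    d_sent=512,     # sentence/token encoder width (keys/values of sent_cross_attn)
    d_con=256,      # graph/concept encoder width (keys/values of graph_cross_attn)
    heads=8,
    dual_enc=True,  # keep both cross-attention branches
)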
def __init__(self, d_model, d_enc, heads, d_ff=2048, dropout=0.1, att_drop=0.1, activation="relu", dual_enc=True):
    super(DoubleAttnTransformerDecoderLayerGraphFirst, self).__init__()
    # Decoder self-attention over the target sequence.
    self.self_attn = MultiHeadedAttention(heads, d_model, query_dim=d_model, dropout=att_drop, use_structure=False)
    self.dual_enc = dual_enc
    self.d_enc = d_enc
    # Cross-attention over the sentence (token-level) encoder states (width d_model).
    self.sent_cross_attn = MultiHeadedAttention(
        heads, d_model, query_dim=d_model, dropout=att_drop, use_structure=False
    )
    if dual_enc:
        # Graph cross-attention; the "graph first" variant gives it its own norm_g/dropout_g,
        # and its keys/values take the remaining d_enc - d_model channels of the encoder output.
        self.graph_cross_attn = MultiHeadedAttention(
            heads, self.d_enc - d_model, query_dim=d_model, dropout=att_drop, use_structure=False
        )
        # self.fuse_linear = nn.Linear(self.d_enc, d_model)
    # Implementation of Feedforward model
    self.linear1 = nn.Linear(d_model, d_ff)
    self.dropout = nn.Dropout(dropout)
    self.linear2 = nn.Linear(d_ff, d_model)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm_g = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.norm3 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout_g = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.dropout3 = nn.Dropout(dropout)
    self.activation = _get_activation_fn(activation)
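# Illustrative instantiation sketch (assumed sizes, not from the original source):
# here d_enc is the combined encoder width, so with d_model=512 and d_enc=768 the
# graph cross-attention expects 768 - 512 = 256-dim graph states alongside the
# 512-dim sentence states.
layer = DoubleAttnTransformerDecoderLayerGraphFirst(
    d_model=512,   # decoder and sentence-encoder hidden size
    d_enc=768,     # total encoder width (sentence + graph states)
    heads=8,
    dual_enc=True, # enable the graph-first cross-attention branch
)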
def __init__(
    self,
    d_model,
    heads,
    d_ff=2048,
    dropout=0.1,
    att_drop=0.1,
    use_structure=True,
    alpha=1.0,
    beta=1.0,
    activation="relu",
):
    super(TransformerEncoderLayer, self).__init__()
    # Self-attention that can additionally condition on structural (relation) information.
    self.self_attn = MultiHeadedAttention(
        heads, d_model, query_dim=d_model, dropout=att_drop, use_structure=use_structure, alpha=alpha, beta=beta
    )
    # Implementation of Feedforward model
    self.linear1 = nn.Linear(d_model, d_ff)
    self.dropout = nn.Dropout(dropout)
    self.linear2 = nn.Linear(d_ff, d_model)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.activation = _get_activation_fn(activation)
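# Illustrative instantiation sketch (assumed hyper-parameters): a structure-aware
# encoder layer; alpha and beta are simply forwarded to MultiHeadedAttention, which
# defines how the structural terms are weighted.
enc_layer = TransformerEncoderLayer(
    d_model=512,
    heads=8,
    use_structure=True,  # let self-attention use structural (e.g. graph-relation) inputs
    alpha=1.0,
    beta=1.0,
)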