# TransformerEncoder.__init__: shares the word-embedding table passed in, adds
# sinusoidal positions, and stacks n_layer vanilla encoder layers
# (absolute positions: rel=False, no head averaging: avg=False).
def __init__(self, word_emb, n_layer, n_head, d_model, d_head, d_inner, dropout, drop_att, pre_layer_norm, device):
    super(TransformerEncoder, self).__init__()
    self.position_emb = PositionalEmbedding(demb=d_model)
    self.word_emb = word_emb
    self.layers = nn.ModuleList([
        EncoderLayer(d_model, d_inner, n_head, d_head, dropout, drop_att,
                     pre_layer_norm, avg=False, rel=False)
        for _ in range(n_layer)])
    self.device = device
# DecoderLayer.__init__: self-attention over the decoder inputs, attention over
# the encoder outputs, and a position-wise feed-forward block. When rel=True the
# layer also owns the sinusoidal embeddings and the learned content/position
# biases (r_w_bias / r_r_bias) used for relative positional attention.
def __init__(self, d_model, d_inner, n_head, d_head, dropout, drop_att, pre_layer_norm, avg, rel):
    super(DecoderLayer, self).__init__()
    self.slf_attn = MixMultiHeadAttention(n_head, d_model, d_head, dropout, drop_att, pre_layer_norm, avg, rel)
    self.enc_attn = MixMultiHeadAttention(n_head, d_model, d_head, dropout, drop_att, pre_layer_norm, avg, rel)
    self.pos_ffn = PositionWiseFF(d_model, d_inner, dropout)
    if rel:
        self.pos_emb = PositionalEmbedding(demb=d_model)
        self.r_w_bias = nn.Parameter(torch.Tensor(n_head, d_head))
        self.r_r_bias = nn.Parameter(torch.Tensor(n_head, d_head))
        nn.init.normal_(self.r_w_bias, 0.0, 0.1)
        nn.init.normal_(self.r_r_bias, 0.0, 0.1)
    self.rel = rel
# RelHDSADecoder.__init__: the hierarchical disentangled decoder. The first three
# layers allocate one attention head per label at each level of the dialog-act
# structure (domains, functions, arguments) with head averaging enabled
# (avg=True); the final layer is a standard n_head multi-head layer. All layers
# use relative positional attention (rel=True).
def __init__(self, word_emb, n_head, d_model, d_head, d_inner, dropout, drop_att, pre_layer_norm, device):
    super(RelHDSADecoder, self).__init__()
    self.word_emb = word_emb
    self.pos_emb = PositionalEmbedding(demb=d_model)
    self.prior_layer = DecoderLayer(d_model, d_inner, len(domains), d_head, dropout, drop_att,
                                    pre_layer_norm, avg=True, rel=True)
    self.middle_layer = DecoderLayer(d_model, d_inner, len(functions), d_head, dropout, drop_att,
                                     pre_layer_norm, avg=True, rel=True)
    self.post_layer = DecoderLayer(d_model, d_inner, len(arguments), d_head, dropout, drop_att,
                                   pre_layer_norm, avg=True, rel=True)
    self.final_layer = DecoderLayer(d_model, d_inner, n_head, d_head, dropout, drop_att,
                                    pre_layer_norm, avg=False, rel=True)
    self.device = device
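
# A minimal usage sketch (illustrative only): how the encoder and the relative
# HDSA decoder might be instantiated with a shared word-embedding table. The
# vocabulary size, layer counts, hidden sizes, and dropout rates below are
# placeholder assumptions, not values taken from this repository's training
# configuration; `domains`, `functions`, and `arguments` are the module-level
# label lists already referenced above.
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    d_model, d_head, d_inner = 128, 64, 512          # assumed sizes
    word_emb = nn.Embedding(3000, d_model)           # assumed vocabulary of 3000 tokens
    encoder = TransformerEncoder(word_emb, n_layer=3, n_head=4, d_model=d_model,
                                 d_head=d_head, d_inner=d_inner, dropout=0.1,
                                 drop_att=0.1, pre_layer_norm=False, device=device)
    decoder = RelHDSADecoder(word_emb, n_head=4, d_model=d_model, d_head=d_head,
                             d_inner=d_inner, dropout=0.1, drop_att=0.1,
                             pre_layer_norm=False, device=device)
    print(encoder)
    print(decoder)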