def __init__(self, vocab_size, embed_dim, dropout, att_type, att_heads,
             att_mid_dim, att_mid_drop, bifeat_emb_act, bifeat_emb_drop,
             ff_dropout, layer_num):
    super(Decoder, self).__init__()
    self.att_heads = att_heads
    self.layers = nn.ModuleList([])
    self.embed_dim = embed_dim

    # Stack of decoder layers; the final one is flagged as the last layer.
    for i in range(layer_num):
        sublayer = DecoderLayer(
            embed_dim=embed_dim,
            dropout=dropout,
            att_type=att_type,
            att_heads=att_heads,
            att_mid_dim=att_mid_dim,
            att_mid_drop=att_mid_drop,
            bifeat_emb_act=bifeat_emb_act,
            bifeat_emb_drop=bifeat_emb_drop,
            ff_dropout=ff_dropout,
            last_layer=(i == layer_num - 1))
        self.layers.append(sublayer)

    # Word embedding, scaling, positional encoding and input normalization.
    self.dropout = nn.Dropout(cfg.MODEL.DROPOUT_WORD_EMBED)
    self.embed_tokens = nn.Embedding(vocab_size, embed_dim)
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEncoding(
        embed_dim, cfg.MODEL.TRANSFORMER.PE_MAX_LEN)
    self.layer_norm_word = torch.nn.LayerNorm(embed_dim)

    # Output projection to the vocabulary.
    self.generator = nn.Linear(embed_dim, vocab_size)

    # Bilinear (wbil) projections and decode-time dropouts.
    self.wbil1 = nn.Sequential(
        nn.Linear(embed_dim, embed_dim),
        utils.activation(cfg.MODEL.BILINEAR.ACT),
        torch.nn.LayerNorm(embed_dim))
    self.wbil2 = nn.Sequential(
        nn.Linear(embed_dim, embed_dim),
        utils.activation(cfg.MODEL.BILINEAR.ACT),
        torch.nn.LayerNorm(embed_dim))
    self.wbi_drop = nn.Dropout(cfg.MODEL.BILINEAR.DECODE_DROPOUT)
    self.dropout_lm = nn.Dropout(cfg.MODEL.DROPOUT_LM)

    # Project the concatenation of (layer_num + 1) embed_dim-sized features
    # back down to embed_dim via a GLU.
    self.proj_norm = nn.Sequential(
        nn.Linear(embed_dim * (layer_num + 1), 2 * embed_dim),
        nn.GLU(),
        torch.nn.LayerNorm(embed_dim))

    self.clear_buffer()
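# Illustrative sketch (not this repo's actual forward pass): how the input-side
# pieces built above are typically combined -- scaled word embeddings plus a
# sinusoidal positional table, followed by dropout. The sinusoidal table below
# is a self-contained stand-in for the repo's PositionalEncoding, and all
# hyper-parameter values here are assumptions.
import math
import torch
import torch.nn as nn

vocab_size, embed_dim, max_len, seq_len = 1000, 512, 100, 20
embed_tokens = nn.Embedding(vocab_size, embed_dim)
embed_scale = math.sqrt(embed_dim)

# Stand-in sinusoidal positional-encoding table of shape (max_len, embed_dim).
position = torch.arange(max_len).unsqueeze(1)
div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
pe = torch.zeros(max_len, embed_dim)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)

tokens = torch.randint(0, vocab_size, (2, seq_len))      # (batch, seq)
x = embed_scale * embed_tokens(tokens) + pe[:seq_len]    # scale, then add positions
x = nn.Dropout(0.1)(x)                                   # stand-in for cfg.MODEL.DROPOUT_WORD_EMBED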
class MHSAModule(tf.keras.layers.Layer):
    def __init__(self, head_size, num_heads, dropout=0.0,
                 kernel_regularizer=L2, bias_regularizer=L2,
                 name="mhsa_module", **kwargs):
        super(MHSAModule, self).__init__(name=name, **kwargs)
        self.pc = PositionalEncoding(name=f"{name}_pe")
        self.ln = tf.keras.layers.LayerNormalization(
            name=f"{name}_ln",
            gamma_regularizer=kernel_regularizer,
            beta_regularizer=bias_regularizer)
        self.mha = tfa.layers.MultiHeadAttention(
            head_size=head_size,
            num_heads=num_heads,
            name=f"{name}_mhsa",
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer)
        self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout")
        self.res_add = tf.keras.layers.Add(name=f"{name}_add")

    def call(self, inputs, **kwargs):
        outputs = self.pc(inputs)
        outputs = self.ln(outputs)
        outputs = self.mha([outputs, outputs, outputs])
        outputs = self.do(outputs)
        outputs = self.res_add([inputs, outputs])
        return outputs

    def get_config(self):
        conf = super(MHSAModule, self).get_config()
        conf.update(self.pc.get_config())
        conf.update(self.ln.get_config())
        conf.update(self.mha.get_config())
        conf.update(self.do.get_config())
        conf.update(self.res_add.get_config())
        return conf
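# Minimal usage sketch for MHSAModule, assuming TensorFlow 2.x with
# tensorflow_addons installed and the repo's PositionalEncoding in scope.
# The head_size/num_heads/shape values and the regularizer strength below
# are illustrative assumptions, not taken from the original configuration.
import tensorflow as tf

reg = tf.keras.regularizers.l2(1e-6)                 # assumed regularizer strength
mhsa = MHSAModule(head_size=36, num_heads=4, dropout=0.1,
                  kernel_regularizer=reg, bias_regularizer=reg)

x = tf.random.normal([2, 50, 144])                   # (batch, time, dim)
y = mhsa(x)                                          # residual output, same shape as input
assert y.shape == x.shape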
def _build_block(self):
    # Positional encoding, then n_convs residual conv layers, one residual
    # self-attention layer and one residual position-wise feed-forward layer.
    self.layers.append(PositionalEncoding())
    for i in range(self.n_convs):
        self.layers.extend(self._build_residual(
            nested_layer='conv', name=self.name + "_conv{}".format(i)))
    self.layers.extend(self._build_residual(
        nested_layer='attn', name=self.name + "_selfAttn"))
    self.layers.extend(self._build_residual(
        nested_layer='pwffn', name=self.name + "_pwffn"))
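# _build_residual is repo-specific; as a rough sketch of the pattern it is
# assumed to follow (layer norm -> nested layer -> dropout -> residual add),
# here is a self-contained functional version. The helper name, argument
# names and dropout rate are assumptions, not the repo's actual code.
import tensorflow as tf

def residual_wrap(x, nested_layer, dropout=0.1):
    """Apply `nested_layer` inside a pre-norm residual connection."""
    y = tf.keras.layers.LayerNormalization()(x)
    y = nested_layer(y)
    y = tf.keras.layers.Dropout(dropout)(y)
    return tf.keras.layers.Add()([x, y])

# Example: a position-wise feed-forward sublayer on a dummy batch.
ffn = tf.keras.Sequential([tf.keras.layers.Dense(512, activation="relu"),
                           tf.keras.layers.Dense(128)])
out = residual_wrap(tf.random.normal([2, 30, 128]), ffn)   # shape preserved: (2, 30, 128)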