def __init__(self, d_model, heads, d_ff, dropout):
    super().__init__()
    self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)
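# The forward pass for this layer is not part of this excerpt. Below is a
# runnable sketch of how the sub-modules above are typically composed:
# normalise, self-attend, add the residual, then run a feed-forward block with
# its own residual. torch.nn.MultiheadAttention and a plain two-layer FFN
# stand in for the repo's MultiHeadedAttention and PositionwiseFeedForward;
# the composition, not the exact modules, is what the sketch shows.
import torch
import torch.nn as nn

class SketchLayer(nn.Module):
    def __init__(self, d_model=512, heads=8, d_ff=2048, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, heads,
                                               dropout=dropout, batch_first=True)
        self.feed_forward = nn.Sequential(nn.Linear(d_model, d_ff), nn.ReLU(),
                                          nn.Linear(d_ff, d_model))
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        input_norm = self.layer_norm(inputs)
        context, _ = self.self_attn(input_norm, input_norm, input_norm)
        out = self.dropout(context) + inputs   # residual around self-attention
        return out + self.feed_forward(out)    # residual around the FFN

out = SketchLayer()(torch.randn(2, 10, 512))   # (batch, seq_len, d_model)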
def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings):
    super(TransformerDecoder, self).__init__()

    # Basic attributes.
    self.decoder_type = 'transformer'
    self.num_layers = num_layers
    self.embeddings = embeddings
    self.pos_emb = PositionalEncoding(dropout, self.embeddings.embedding_dim)

    # Graph context attention sub-layer applied on top of the decoder stack.
    self.context_attn_graph = MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.drop_3 = nn.Dropout(dropout)
    self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)

    # Build TransformerDecoder.
    self.transformer_layers = nn.ModuleList(
        [TransformerDecoderLayer(d_model, heads, d_ff, dropout)
         for _ in range(num_layers)])
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

    # Attention weights and projection used to fuse graph-aware context with
    # the decoder states.
    self.att_weight_c = nn.Linear(self.embeddings.embedding_dim, 1)
    self.att_weight_q = nn.Linear(self.embeddings.embedding_dim, 1)
    self.att_weight_cq = nn.Linear(self.embeddings.embedding_dim, 1)
    self.graph_act = gelu
    self.graph_aware = nn.Linear(self.embeddings.embedding_dim * 3,
                                 self.embeddings.embedding_dim)
    self.graph_drop = nn.Dropout(dropout)
    self.linear_filter = nn.Linear(d_model * 2, 1)

    # Learnable, linearly decaying attention prior over up to 512 key
    # positions, replicated for 8 heads and 512 query positions. Assigning an
    # nn.Parameter attribute registers it, so no explicit register_parameter
    # call is needed; get_device() is assumed to be defined on this class.
    fix_top = (torch.arange(512, 0, -1).float() / 512)\
        .unsqueeze(0).unsqueeze(0).expand(8, 512, -1).contiguous()
    self.fix_top = nn.Parameter(fix_top.to(self.get_device()), requires_grad=True)
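# A quick standalone check of the parameter-registration point noted above:
# assigning an nn.Parameter attribute is enough for it to appear in
# named_parameters(), and the resulting prior has shape (8, 512, 512). The
# _Demo class is purely illustrative and not part of the source.
import torch
import torch.nn as nn

class _Demo(nn.Module):
    def __init__(self):
        super().__init__()
        decay = torch.arange(512, 0, -1).float() / 512
        self.fix_top = nn.Parameter(
            decay.unsqueeze(0).unsqueeze(0).expand(8, 512, -1).contiguous())

params = dict(_Demo().named_parameters())
assert 'fix_top' in params
assert params['fix_top'].shape == (8, 512, 512)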
def __init__(self, d_model, heads, d_ff, dropout,
             topic=False, topic_dim=300, split_noise=False):
    super(TransformerDecoderLayer, self).__init__()

    self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
    self.context_attn = MultiHeadedAttention(heads, d_model, dropout=dropout,
                                             topic=topic, topic_dim=topic_dim,
                                             split_noise=split_noise)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)
    mask = self._get_attn_subsequent_mask(MAX_SIZE)
    # Register self.mask as a buffer in TransformerDecoderLayer, so
    # it gets TransformerDecoderLayer's cuda behavior automatically.
    self.register_buffer('mask', mask)
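# `_get_attn_subsequent_mask` (and the MAX_SIZE constant) are not defined in
# this excerpt. A minimal sketch, assuming the standard OpenNMT-style helper
# that builds an upper-triangular mask over future positions; in the source it
# is a method on the layer, written here as a free function so the snippet is
# self-contained.
import numpy as np
import torch

def _get_attn_subsequent_mask(size):
    """Return a (1, size, size) uint8 mask that is 1 at future positions."""
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask)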
def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings,
             use_universal_transformer):
    super(TransformerDecoder, self).__init__()

    # Basic attributes.
    self.decoder_type = 'transformer'
    self.num_layers = num_layers
    self.embeddings = embeddings
    self.pos_emb = PositionalEncoding(dropout, self.embeddings.embedding_dim)

    # Build TransformerDecoder.
    self.dim_mismatch = d_model != 768
    if self.dim_mismatch:
        # The encoder states are 768-dimensional; bridge the gap to d_model
        # with a pair of linear projections.
        self.linear_custom = nn.Linear(768, d_model)
        self.linear_custom_reverse = nn.Linear(d_model, 768)
        print("TransformerDecoder# Dimension of input is 768, while d_model is {}. "
              "Adding upsampling and downsampling layers.".format(d_model))

    self.common_ff = None
    if use_universal_transformer:
        # Share a single feed-forward block across all decoder layers.
        print("Using Universal Transformer in Decoder")
        self.common_ff = PositionwiseFeedForward(d_model, d_ff, dropout)

    self.transformer_layers = nn.ModuleList([
        TransformerDecoderLayer(d_model, heads, d_ff, dropout, self.common_ff)
        for _ in range(num_layers)
    ])
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
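# The forward pass is not shown in this excerpt. Below is a runnable,
# standalone sketch of the dimension bridge created above: a projection from
# the 768-dimensional encoder states to d_model and a reverse projection back.
# The tensor shapes and the value of d_model are illustrative assumptions.
import torch
import torch.nn as nn

d_model = 512
linear_custom = nn.Linear(768, d_model)
linear_custom_reverse = nn.Linear(d_model, 768)

encoder_states = torch.randn(2, 10, 768)        # (batch, src_len, 768)
projected = linear_custom(encoder_states)       # -> (2, 10, d_model)
restored = linear_custom_reverse(projected)     # -> (2, 10, 768)
assert restored.shape == encoder_states.shape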
def __init__(self, d_model, heads, d_ff, dropout):
    super(TransformerDecoderLayer, self).__init__()

    self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
    self.context_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)
    mask = self._get_attn_subsequent_mask(5000)
    self.register_buffer('mask', mask)
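# A runnable sketch of how the registered causal `mask` buffer above is
# typically combined with a target-side padding mask inside the layer's
# forward pass (not shown in this excerpt). A small maximum size is used here
# instead of 5000, and the variable names are illustrative.
import numpy as np
import torch

max_size, tgt_len = 16, 6
causal = torch.from_numpy(
    np.triu(np.ones((1, max_size, max_size), dtype=np.uint8), k=1))
tgt_pad_mask = torch.zeros(1, tgt_len, tgt_len, dtype=torch.uint8)  # no padding
dec_mask = torch.gt(causal[:, :tgt_len, :tgt_len] + tgt_pad_mask, 0)
# dec_mask[b, i, j] is True where query position i must NOT attend to key j.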