def __init__(self, d_model, heads, d_ff, dropout, attn_dropout,
             self_attn_type="scaled-dot", max_relative_positions=0,
             ctx_weight_param=False):
    super(TransformerGPTDecoderLayerCtxattn, self).__init__()
    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=attn_dropout,
            max_relative_positions=max_relative_positions)
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(d_model, dropout=attn_dropout)
    self.context_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
    self.feed_forward = MLP(d_model, d_model * 4, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-5)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-5)
    self.context_layer_norm = nn.LayerNorm(d_model, eps=1e-5)
    self.drop = nn.Dropout(dropout)
    if ctx_weight_param:
        print('using ctx_weight_param')
        self.ctx_weight = Parameter(torch.zeros(1))
    self.ctx_weight_param = ctx_weight_param
def __init__(self, opt, d_model, heads, d_ff, dropout, attention_dropout,
             self_attn_type="scaled-dot", max_relative_positions=0,
             aan_useffn=False, dict_size=None, label_emb=None):
    super(TransformerDecoderLayer, self).__init__()
    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions,
            dict_size=dict_size, label_emb=label_emb, opt=opt)
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(d_model,
                                          dropout=attention_dropout,
                                          aan_useffn=aan_useffn)
    self.context_attn = MultiHeadedAttention(heads, d_model,
                                             dropout=attention_dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)
def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
             self_attn_type="scaled-dot", max_relative_positions=0,
             aan_useffn=False, tgt_concept_words_type=-1):
    super(TransformerDecoderLayer, self).__init__()
    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(d_model,
                                          dropout=attention_dropout,
                                          aan_useffn=aan_useffn)
    self.context_attn = MultiHeadedAttention(heads, d_model,
                                             dropout=attention_dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)
    self.tgt_concept_words_type = tgt_concept_words_type
    if tgt_concept_words_type in [2]:
        self.tgt_concept_mlp = nn.Linear(d_model * 2, d_model)
def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
             self_attn_type="scaled-dot", max_relative_positions=0,
             aan_useffn=False, full_context_alignment=False,
             alignment_heads=None):
    super(TransformerDecoderLayer, self).__init__()
    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(d_model,
                                          dropout=attention_dropout,
                                          aan_useffn=aan_useffn)
    self.context_attn = MultiHeadedAttention(heads, d_model,
                                             dropout=attention_dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)
    self.full_context_alignment = full_context_alignment
    self.alignment_heads = alignment_heads
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 max_relative_positions=0):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, mask):
        input_norm = self.layer_norm(inputs)
        context, _ = self.self_attn(input_norm, input_norm, input_norm,
                                    mask=mask, attn_type="self")
        out = self.dropout(context) + inputs
        return self.feed_forward(out)

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.dropout.p = dropout
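# A minimal smoke test for the encoder layer above; a sketch, not from the
# source. It assumes TransformerEncoderLayer and its OpenNMT-style
# submodules (MultiHeadedAttention, PositionwiseFeedForward) are importable,
# and uses the (batch, 1, src_len) padding-mask convention documented for
# the other encoder variants in this section.
import torch

batch, src_len, d_model = 2, 7, 512
layer = TransformerEncoderLayer(d_model=d_model, heads=8, d_ff=2048,
                                dropout=0.1, attention_dropout=0.1)

inputs = torch.randn(batch, src_len, d_model)
lengths = torch.tensor([7, 5])
# True at padding positions; broadcast over query positions inside attention.
pad_mask = (torch.arange(src_len)[None, :] >= lengths[:, None]).unsqueeze(1)

out = layer(inputs, pad_mask)  # (batch, src_len, d_model)
assert out.shape == (batch, src_len, d_model)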
def __init__(self, d_model, heads, d_ff, dropout,
             self_attn_type="scaled-dot", max_relative_positions=0):
    super(TransformerDecoderLayer, self).__init__()
    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(d_model, dropout=dropout)
    self.context_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)
def __init__(self, d_model, heads, d_ff, dropout,
             self_attn_type="scaled-dot"):
    super(TransformerDecoderLayer, self).__init__()
    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(d_model, dropout=dropout)
    self.context_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)
    mask = self._get_attn_subsequent_mask(MAX_SIZE)
    # Register self.mask as a buffer in TransformerDecoderLayer, so
    # it gets TransformerDecoderLayer's cuda behavior automatically.
    self.register_buffer('mask', mask)
def __init__(
    self,
    d_model,
    heads,
    d_ff,
    dropout,
    attention_dropout,
    self_attn_type="scaled-dot",
    max_relative_positions=0,
    aan_useffn=False,
    full_context_alignment=False,
    alignment_heads=0,
    pos_ffn_activation_fn=ActivationFunction.relu,
):
    """
    Args:
        See TransformerDecoderLayerBase
    """
    super(TransformerDecoderLayer, self).__init__(
        d_model,
        heads,
        d_ff,
        dropout,
        attention_dropout,
        self_attn_type,
        max_relative_positions,
        aan_useffn,
        full_context_alignment,
        alignment_heads,
        pos_ffn_activation_fn=pos_ffn_activation_fn,
    )
    self.context_attn = MultiHeadedAttention(heads, d_model,
                                             dropout=attention_dropout)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
             max_relative_positions=0):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=attention_dropout,
        max_relative_positions=max_relative_positions)
    self.video_attn = MultiHeadedAttention(
        heads, d_model, dropout=attention_dropout,
        max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm1 = LayerNorm(d_model)
    self.layer_norm2 = LayerNorm(d_model)
    self.drop = nn.Dropout(dropout)
    self.sublayer = nn.ModuleList(
        [SublayerConnection(d_model, dropout) for _ in range(3)])
class TransformerEncoderLayer(nn.Module):
    """
    A single layer of the transformer encoder.

    Args:
        d_model (int): the dimension of keys/values/queries in
            MultiHeadedAttention, also the input size of
            the first layer of the PositionwiseFeedForward.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            PositionwiseFeedForward.
        dropout (float): dropout probability (0-1.0).
        pos_ffn_activation_fn (ActivationFunction):
            activation function choice for PositionwiseFeedForward layer
    """

    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 max_relative_positions=0,
                 pos_ffn_activation_fn=ActivationFunction.relu):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                    pos_ffn_activation_fn)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, mask):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, src_len, model_dim)``
            mask (LongTensor): ``(batch_size, 1, src_len)``

        Returns:
            (FloatTensor):
            * outputs ``(batch_size, src_len, model_dim)``
        """
        input_norm = self.layer_norm(inputs)
        context, _ = self.self_attn(input_norm, input_norm, input_norm,
                                    mask=mask, attn_type="self")
        out = self.dropout(context) + inputs
        return self.feed_forward(out)

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.dropout.p = dropout
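# The update_dropout hooks defined by the encoder layers in this section let
# a training loop change dropout probabilities on a built layer, e.g. for a
# dropout schedule. A short sketch with hypothetical values, assuming the
# class above and its ActivationFunction default:
layer = TransformerEncoderLayer(d_model=512, heads=8, d_ff=2048,
                                dropout=0.1, attention_dropout=0.1)
layer.update_dropout(dropout=0.05, attention_dropout=0.05)
assert layer.dropout.p == 0.05  # residual dropout is updated in place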
def __init__(self, d_model, heads, d_ff, dropout, max_relative_positions=0):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=dropout,
        max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)
def __init__(self, d_model, heads, d_ff, dropout, max_relative_positions=0,
             strided_attn=False, conv_k_v=False):
    super(TransformerEncoderLayer, self).__init__()
    self.strided_attn = strided_attn
    self.conv_k_v = conv_k_v
    if self.strided_attn:
        self.self_attn = MultiHeadedStridedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
    else:
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.conv1d_k_v = nn.Conv1d(d_model, d_model, kernel_size=3, stride=3)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)
def __init__(self, d_model, heads, d_ff, dropout, max_relative_positions=0,
             downsampling=1):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=dropout,
        max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.ds_layer = nn.Linear(d_model, int(
        d_model / downsampling)) if downsampling > 1 else None
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)
def __init__(self, d_model, heads, d_ff, dropout, attn_dropout,
             max_relative_positions=0):
    super(TransformerGPTEncoderLayer, self).__init__()
    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=attn_dropout,
        max_relative_positions=max_relative_positions)
    self.feed_forward = MLP(d_model, d_model * 4, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-5)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-5)
    self.dropout = nn.Dropout(dropout)
def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
             max_relative_positions=0, activation='relu', is_bert=False):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=attention_dropout,
        max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                activation)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-12 if is_bert else 1e-6)
    self.dropout = nn.Dropout(dropout)
def __init__(self, d_model, heads, d_ff, dropout, attn_dropout,
             self_attn_type="scaled-dot", max_relative_positions=0):
    super(TransformerGPTUnconditionalDecoderLayer, self).__init__()
    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=attn_dropout,
            max_relative_positions=max_relative_positions)
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(d_model, dropout=attn_dropout)
    self.feed_forward = MLP(d_model, d_model * 4, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-5)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-5)
    self.drop = nn.Dropout(dropout)
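# ``MLP(d_model, d_model * 4, dropout)`` in the GPT-style layers above
# follows the GPT convention of a 4x hidden expansion in the feed-forward
# block. The fork's MLP class is not shown in this section; a minimal
# stand-in consistent with that call signature (an assumption, not the
# source implementation) could look like:
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, d_model, d_hidden, dropout):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(d_model, d_hidden)  # expand to 4 * d_model
        self.fc2 = nn.Linear(d_hidden, d_model)  # project back
        self.act = nn.GELU()                     # GPT-style activation
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        return self.drop(self.fc2(self.act(self.fc1(x))))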
class TransformerDecoderLayer(nn.Module):
    """
    Args:
        d_model (int): the dimension of keys/values/queries in
            :class:`MultiHeadedAttention`, also the input size of
            the first layer of the :class:`PositionwiseFeedForward`.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            :class:`PositionwiseFeedForward`.
        dropout (float): dropout probability.
        self_attn_type (string): type of self-attention: scaled-dot, average
    """

    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 self_attn_type="scaled-dot", max_relative_positions=0,
                 aan_useffn=False, full_context_alignment=False,
                 alignment_heads=None):
        super(TransformerDecoderLayer, self).__init__()
        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads, d_model, dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)
        self.context_attn = MultiHeadedAttention(heads, d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads

    def forward(self, *args, **kwargs):
        """Extend _forward for (possibly) multiple decoder passes:

        1. Always a default (future masked) decoder forward pass.
        2. Possibly a second future-aware decoder pass to jointly learn
           full-context alignment.

        Args:
            * All arguments of _forward.
            with_align (bool): whether to return alignment attention.

        Returns:
            (FloatTensor, FloatTensor, FloatTensor or None):
            * output ``(batch_size, 1, model_dim)``
            * top_attn ``(batch_size, 1, src_len)``
            * attn_align ``(batch_size, 1, src_len)`` or None
        """
        with_align = kwargs.pop('with_align', False)
        output, attns = self._forward(*args, **kwargs)
        top_attn = attns[:, 0, :, :].contiguous()
        attn_align = None
        if with_align:
            if self.full_context_alignment:
                # return _, (B, Q_len, K_len)
                _, attns = self._forward(*args, **kwargs, future=True)
            if self.alignment_heads is not None:
                attns = attns[:, :self.alignment_heads, :, :].contiguous()
            # layer average attention across heads, get ``(B, Q, K)``
            # Case 1: no full_context, no align heads -> layer avg baseline
            # Case 2: no full_context, 1 align heads -> guided align
            # Case 3: full_context, 1 align heads -> full cte guided align
            attn_align = attns.mean(dim=1)
        return output, top_attn, attn_align

    def _forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
                 layer_cache=None, step=None, future=False):
        """A naive forward pass for transformer decoder.

        # TODO: change 1 to T as T could be 1 or tgt_len

        Args:
            inputs (FloatTensor): ``(batch_size, 1, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, 1)``

        Returns:
            (FloatTensor, FloatTensor):
            * output ``(batch_size, 1, model_dim)``
            * attns ``(batch_size, head, 1, src_len)``
        """
        dec_mask = None
        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            if not future:
                # apply future_mask, result mask in (B, T, T)
                future_mask = torch.ones([tgt_len, tgt_len],
                                         device=tgt_pad_mask.device,
                                         dtype=torch.uint8)
                future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
                # BoolTensor was introduced in pytorch 1.2
                try:
                    future_mask = future_mask.bool()
                except AttributeError:
                    pass
                dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
            else:
                # only mask padding, result mask in (B, 1, T)
                dec_mask = tgt_pad_mask

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, _ = self.self_attn(input_norm, input_norm, input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      attn_type="self")
        elif isinstance(self.self_attn, AverageAttention):
            query, _ = self.self_attn(input_norm, mask=dec_mask,
                                      layer_cache=layer_cache, step=step)
        elif isinstance(self.self_attn, MultiHeadedCausalAttention):
            query, _ = self.self_attn(input_norm, input_norm, input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      attn_type="self", decoder=True)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, attns = self.context_attn(memory_bank, memory_bank, query_norm,
                                       mask=src_pad_mask,
                                       layer_cache=layer_cache,
                                       attn_type="context")
        output = self.feed_forward(self.drop(mid) + query)

        return output, attns

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.context_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
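# The padding-plus-future masking in _forward above is the part most easily
# gotten wrong; a standalone sketch of the same construction on dummy
# shapes, using only torch (values are illustrative).
import torch

batch, tgt_len = 2, 5
# Padding mask, True at pad positions: (batch, 1, tgt_len).
tgt_pad_mask = torch.zeros(batch, 1, tgt_len, dtype=torch.bool)
tgt_pad_mask[1, 0, 3:] = True  # second sequence has length 3

# Upper-triangular future mask, True strictly above the diagonal.
future_mask = torch.ones(tgt_len, tgt_len, dtype=torch.uint8)
future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len).bool()

# torch.gt(a + b, 0) acts as a logical OR (and also works on the pre-1.2
# uint8 masks): a position is masked if it is padding OR in the future.
dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
assert dec_mask.shape == (batch, tgt_len, tgt_len)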
class TransformerDecoderLayer(TransformerDecoderLayerBase):
    """Transformer Decoder layer block in Pre-Norm style.

    Pre-Norm style is an improvement w.r.t. the original paper's Post-Norm
    style, providing better convergence speed and performance. This is also
    the actual implementation in tensor2tensor and is also available in
    fairseq. See https://tunz.kr/post/4 and :cite:`DeeperTransformer`.

    .. mermaid::

        graph LR
        %% "*SubLayer" can be self-attn, src-attn or feed forward block
            A(input) --> B[Norm]
            B --> C["*SubLayer"]
            C --> D[Drop]
            D --> E((+))
            A --> E
            E --> F(out)

    """

    def __init__(
        self,
        d_model,
        heads,
        d_ff,
        dropout,
        attention_dropout,
        self_attn_type="scaled-dot",
        max_relative_positions=0,
        aan_useffn=False,
        full_context_alignment=False,
        alignment_heads=0,
        pos_ffn_activation_fn=ActivationFunction.relu,
    ):
        """
        Args:
            See TransformerDecoderLayerBase
        """
        super(TransformerDecoderLayer, self).__init__(
            d_model,
            heads,
            d_ff,
            dropout,
            attention_dropout,
            self_attn_type,
            max_relative_positions,
            aan_useffn,
            full_context_alignment,
            alignment_heads,
            pos_ffn_activation_fn=pos_ffn_activation_fn,
        )
        self.context_attn = MultiHeadedAttention(heads, d_model,
                                                 dropout=attention_dropout)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)

    def update_dropout(self, dropout, attention_dropout):
        super(TransformerDecoderLayer, self).update_dropout(
            dropout, attention_dropout)
        self.context_attn.update_dropout(attention_dropout)

    def _forward(
        self,
        inputs,
        memory_bank,
        src_pad_mask,
        tgt_pad_mask,
        layer_cache=None,
        step=None,
        future=False,
    ):
        """A naive forward pass for transformer decoder.

        # T: could be 1 in the case of stepwise decoding or tgt_len

        Args:
            inputs (FloatTensor): ``(batch_size, T, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (bool): ``(batch_size, 1, src_len)``
            tgt_pad_mask (bool): ``(batch_size, 1, T)``
            layer_cache (dict or None): cached layer info when stepwise decode
            step (int or None): stepwise decoding counter
            future (bool): If set True, do not apply future_mask.

        Returns:
            (FloatTensor, FloatTensor):
            * output ``(batch_size, T, model_dim)``
            * attns ``(batch_size, head, T, src_len)``
        """
        dec_mask = None

        if inputs.size(1) > 1:
            # masking is necessary when sequence length is greater than one
            dec_mask = self._compute_dec_mask(tgt_pad_mask, future)

        inputs_norm = self.layer_norm_1(inputs)

        query, _ = self._forward_self_attn(inputs_norm, dec_mask,
                                           layer_cache, step)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, attns = self.context_attn(
            memory_bank,
            memory_bank,
            query_norm,
            mask=src_pad_mask,
            layer_cache=layer_cache,
            attn_type="context",
        )
        output = self.feed_forward(self.drop(mid) + query)

        return output, attns
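# The Pre-Norm wiring in the docstring's diagram, reduced to a pair of
# generic helpers for contrast with the original paper's Post-Norm; the
# norm/sublayer/drop names here are illustrative, not from the source.
import torch
import torch.nn as nn

def pre_norm_block(x, norm, sublayer, drop):
    # input -> Norm -> SubLayer -> Drop -> (+) input, as in the diagram.
    return x + drop(sublayer(norm(x)))

def post_norm_block(x, norm, sublayer, drop):
    # Post-Norm ordering from the original Transformer, for comparison.
    return norm(x + drop(sublayer(x)))

d = 8
x = torch.randn(2, 5, d)
out = pre_norm_block(x, nn.LayerNorm(d), nn.Linear(d, d), nn.Dropout(0.1))
assert out.shape == x.shape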
def __init__(
    self,
    d_model,
    heads,
    d_ff,
    dropout,
    attention_dropout,
    self_attn_type="scaled-dot",
    max_relative_positions=0,
    aan_useffn=False,
    full_context_alignment=False,
    alignment_heads=0,
    pos_ffn_activation_fn=ActivationFunction.relu,
):
    """
    Args:
        d_model (int): the dimension of keys/values/queries in
            :class:`MultiHeadedAttention`, also the input size of
            the first layer of the :class:`PositionwiseFeedForward`.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            :class:`PositionwiseFeedForward`.
        dropout (float): dropout in residual, self-attn(dot) and
            feed-forward
        attention_dropout (float): dropout in context_attn (and
            self-attn(avg))
        self_attn_type (string): type of self-attention: scaled-dot, average
        max_relative_positions (int):
            Max distance between inputs in relative positions
            representations
        aan_useffn (bool): Turn on the FFN layer in the AAN decoder
        full_context_alignment (bool):
            whether to enable an extra full-context decoder forward for
            alignment
        alignment_heads (int):
            number of cross-attention heads to use for alignment guiding
        pos_ffn_activation_fn (ActivationFunction):
            activation function choice for PositionwiseFeedForward layer
    """
    super(TransformerDecoderLayerBase, self).__init__()

    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=attention_dropout,
            max_relative_positions=max_relative_positions,
        )
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(d_model,
                                          dropout=attention_dropout,
                                          aan_useffn=aan_useffn)

    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                pos_ffn_activation_fn)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)
    self.full_context_alignment = full_context_alignment
    self.alignment_heads = alignment_heads
def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
             max_relative_positions=0, num_boost=4, learnable_weights=True,
             boost_type='continuous', main_stream=False, boost_drop_rate=0.1,
             boost_dropout_diff=0.0, boost_with_ffn=False, boost_str='',
             boost_gating=False, mask_pos_type=[],
             self_att_merge_layer=False, adv_bias_step=0.0,
             shuffle_merge=False, shuffle_merge_type="sum",
             adv_gradient_boost=False, adv_gradient_boost_step=0.01,
             adv_gradient_boost_func='mse', adv_gradient_boost_no_ce=False,
             gradient_boost_scale=1.0, boost_adv_method_list=[],
             boost_sample_rate=1.0, shuffle_fix=0, boost_single_att=False,
             boost_single_ffn=False, shuffle_stop_gradient=False):
    super(TransformerEncoderBoostLayer, self).__init__()
    self.num_boost = num_boost
    self.boost_type = boost_type
    self.main_stream = main_stream
    self.boost_drop_rate = boost_drop_rate
    self.boost_with_ffn = boost_with_ffn
    self.use_adv = True if self.boost_type == 'adv' else False
    self.a_num = num_boost
    # self.use_dropout_diff = True if boost_dropout_diff != 0.0 else False
    self.use_dropout_diff = False
    self.d_num = num_boost
    self.use_mask = True if self.boost_type in {
        'continuous', 'continuous_comp', 'random', 'pos'} else False
    # overwrite params based on boost_str
    self.boost_gating = boost_gating
    self.mask_pos_type = mask_pos_type
    # init postag params
    self.use_postag = False
    self.p_num = 0
    self._parse_boost_str(boost_str)
    # whether to use self-att to merge each path's output
    self.use_self_att_merge_layer = self_att_merge_layer
    self.adv_bias_step = adv_bias_step
    self.shuffle_merge = shuffle_merge
    self.shuffle_merge_type = shuffle_merge_type
    self.adv_gradient_boost = adv_gradient_boost
    self.adv_gradient_boost_step = adv_gradient_boost_step
    self.adv_gradient_boost_func = adv_gradient_boost_func
    self.adv_gradient_boost_no_ce = adv_gradient_boost_no_ce
    self.gradient_boost_scale = gradient_boost_scale
    self.boost_sample_rate = boost_sample_rate
    self.boost_single_att = boost_single_att
    self.boost_single_ffn = boost_single_ffn
    self.shuffle_stop_gradient = shuffle_stop_gradient

    # compute dropout list
    if not self.use_dropout_diff:
        dropout_list = [dropout for i in range(self.num_boost)]
    else:
        dropout_diffs = [
            boost_dropout_diff * i
            - float(self.d_num) / 2 * boost_dropout_diff
            for i in range(self.d_num)]
        dropout_list = (
            [dropout + dropout_diffs[i] for i in range(self.d_num)]
            + [dropout for i in range(self.num_boost - self.d_num)])
    self.dropout_list = dropout_list
    print("Boost dropout list: {}".format(dropout_list))
    assert max(dropout_list) <= 1.0 and min(dropout_list) >= 0.0

    # list of self-attention modules
    if not self.boost_single_att:
        self.self_attn_list = [
            MultiHeadedAttention(
                heads, d_model, dropout=attention_dropout,
                max_relative_positions=max_relative_positions)
            for n in range(self.num_boost)
        ]
        self.self_attn_list = nn.ModuleList(self.self_attn_list)
    else:
        self.self_attn_list = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
    # assert self.d_num == self.num_boost
    if self.main_stream:
        # main stream for self-attention
        self.main_self_attn = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
    if self.use_self_att_merge_layer:
        # keep the default setting for the self-attention merge layer
        self.att_merge_layer = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.merge_layer_norm = nn.LayerNorm(d_model, eps=1e-6)

    # initialize all merge weights to 1/N
    weights_init = torch.ones(self.num_boost,
                              dtype=torch.float32) / self.num_boost
    self.weights = nn.Parameter(weights_init,
                                requires_grad=learnable_weights)

    if self.boost_with_ffn:
        if not self.boost_single_ffn:
            feed_forward_list = [
                PositionwiseFeedForward(d_model, d_ff, dropout_list[i])
                for i in range(self.num_boost)
            ]
            self.feed_forward = nn.ModuleList(feed_forward_list)
        else:
            self.feed_forward = PositionwiseFeedForward(d_model, d_ff,
                                                        dropout_list[0])
    else:
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)
    # TODO: functions for drop_rate are not implemented yet.
    self.shuffle_fix = shuffle_fix
    if self.shuffle_merge:
        if shuffle_fix == 0:
            shuffle_matrix = torch.abs(
                torch.randn(self.num_boost, self.num_boost))
        else:
            shuffle_matrix = torch.ones(
                self.num_boost, self.num_boost) / self.num_boost
        self.shuffle_matrix = nn.Parameter(shuffle_matrix)
    self.merge_weights = torch.ones((self.num_boost - 1,),
                                    dtype=torch.float32,
                                    requires_grad=False)

    if self.use_adv or self.use_postag is True:
        # permutation of max position range.
        self.max_perm = 3
        self.max_exchange = 3
        if not boost_adv_method_list:
            all_adv_methods = ['swap', 'reorder', 'delete', 'mask']
        else:
            all_adv_methods = boost_adv_method_list
        assert self.a_num <= len(all_adv_methods)
        self.activate_methods = all_adv_methods[:self.a_num]
        # create mask tensor
        if "mask" in self.activate_methods or self.use_postag is True:
            mask_tensor = torch.empty(d_model)
            torch.nn.init.normal_(mask_tensor, std=1.0 / math.sqrt(d_model))
            self.mask_tensor = nn.Parameter(mask_tensor)
        print('Activated adversarial methods: {}'.format(
            self.activate_methods))
    if self.use_postag:
        assert len(self.mask_pos_type) == self.p_num

    if self.adv_gradient_boost is True:
        if adv_gradient_boost_func == 'mse':
            self.mse = nn.MSELoss(reduction='none')
        elif adv_gradient_boost_func == 'cos':
            self.cos_sim = nn.CosineSimilarity(dim=2)
        elif adv_gradient_boost_func == 'l1':
            self.l1 = nn.L1Loss(reduction='none')
        else:
            raise ValueError()

    self.keep_adv_gradient = False
    self.adv_gradient_value = 'moving_average'
    if self.keep_adv_gradient:
        # NOTE: the original call omitted the buffer tensor
        # (``self.register_buffer('gradient_moving_average', )``), which
        # would raise at runtime; a zero placeholder is assumed here.
        self.register_buffer('gradient_moving_average', torch.zeros(d_model))

    self.keep_ffn_dist = []
    self.keep_attn_dist = []
    self.keep_attn_out_dist = []
    self.keep_attn_score = []
    return
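# A minimal sketch of the boosted-ensemble merge implied by weights_init
# above: num_boost parallel attention outputs combined by learnable weights
# initialized to 1/N. The tensors and names here are illustrative stand-ins
# for the layer's real streams.
import torch
import torch.nn as nn

num_boost, batch, src_len, d_model = 4, 2, 7, 16
weights = nn.Parameter(torch.ones(num_boost) / num_boost)

# Stand-ins for the outputs of the parallel self-attention paths.
stream_outputs = [torch.randn(batch, src_len, d_model)
                  for _ in range(num_boost)]

stacked = torch.stack(stream_outputs)  # (N, B, L, D)
merged = (weights.view(-1, 1, 1, 1) * stacked).sum(dim=0)  # (B, L, D)
assert merged.shape == (batch, src_len, d_model)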
class TransformerDecoderLayer(nn.Module):
    """Transformer Decoder layer block in Pre-Norm style.

    Pre-Norm style is an improvement w.r.t. the original paper's Post-Norm
    style, providing better convergence speed and performance. This is also
    the actual implementation in tensor2tensor and is also available in
    fairseq. See https://tunz.kr/post/4 and :cite:`DeeperTransformer`.

    .. mermaid::

        graph LR
        %% "*SubLayer" can be self-attn, src-attn or feed forward block
            A(input) --> B[Norm]
            B --> C["*SubLayer"]
            C --> D[Drop]
            D --> E((+))
            A --> E
            E --> F(out)

    Args:
        d_model (int): the dimension of keys/values/queries in
            :class:`MultiHeadedAttention`, also the input size of
            the first layer of the :class:`PositionwiseFeedForward`.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            :class:`PositionwiseFeedForward`.
        dropout (float): dropout in residual, self-attn(dot) and
            feed-forward
        attention_dropout (float): dropout in context_attn (and
            self-attn(avg))
        self_attn_type (string): type of self-attention: scaled-dot, average
        max_relative_positions (int):
            Max distance between inputs in relative positions
            representations
        aan_useffn (bool): Turn on the FFN layer in the AAN decoder
        full_context_alignment (bool):
            whether to enable an extra full-context decoder forward for
            alignment
        alignment_heads (int):
            number of cross-attention heads to use for alignment guiding
    """

    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 self_attn_type="scaled-dot", max_relative_positions=0,
                 aan_useffn=False, full_context_alignment=False,
                 alignment_heads=0):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads, d_model, dropout=attention_dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads, d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads

    def forward(self, *args, **kwargs):
        """Extend `_forward` for (possibly) multiple decoder passes:

        Always a default (future masked) decoder forward pass, and
        possibly a second future-aware decoder pass to jointly learn
        full-context alignment, :cite:`garg2019jointly`.

        Args:
            * All arguments of _forward.
            with_align (bool): whether to return alignment attention.

        Returns:
            (FloatTensor, FloatTensor, FloatTensor or None):
            * output ``(batch_size, T, model_dim)``
            * top_attn ``(batch_size, T, src_len)``
            * attn_align ``(batch_size, T, src_len)`` or None
        """
        with_align = kwargs.pop('with_align', False)
        output, attns = self._forward(*args, **kwargs)
        top_attn = attns[:, 0, :, :].contiguous()
        attn_align = None
        if with_align:
            if self.full_context_alignment:
                # return _, (B, Q_len, K_len)
                _, attns = self._forward(*args, **kwargs, future=True)
            if self.alignment_heads > 0:
                attns = attns[:, :self.alignment_heads, :, :].contiguous()
            # layer average attention across heads, get ``(B, Q, K)``
            # Case 1: no full_context, no align heads -> layer avg baseline
            # Case 2: no full_context, 1 align heads -> guided align
            # Case 3: full_context, 1 align heads -> full cte guided align
            attn_align = attns.mean(dim=1)
        return output, top_attn, attn_align

    def _forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
                 layer_cache=None, step=None, future=False):
        """A naive forward pass for transformer decoder.

        # T: could be 1 in the case of stepwise decoding or tgt_len

        Args:
            inputs (FloatTensor): ``(batch_size, T, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, T)``
            layer_cache (dict or None): cached layer info when stepwise decode
            step (int or None): stepwise decoding counter
            future (bool): If set True, do not apply future_mask.

        Returns:
            (FloatTensor, FloatTensor):
            * output ``(batch_size, T, model_dim)``
            * attns ``(batch_size, head, T, src_len)``
        """
        dec_mask = None
        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            if not future:
                # apply future_mask, result mask in (B, T, T)
                future_mask = torch.ones([tgt_len, tgt_len],
                                         device=tgt_pad_mask.device,
                                         dtype=torch.uint8)
                future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
                # BoolTensor was introduced in pytorch 1.2
                try:
                    future_mask = future_mask.bool()
                except AttributeError:
                    pass
                dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
            else:
                # only mask padding, result mask in (B, 1, T)
                dec_mask = tgt_pad_mask

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, _ = self.self_attn(input_norm, input_norm, input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      attn_type="self")
        elif isinstance(self.self_attn, AverageAttention):
            query, _ = self.self_attn(input_norm, mask=dec_mask,
                                      layer_cache=layer_cache, step=step)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, attns = self.context_attn(memory_bank, memory_bank, query_norm,
                                       mask=src_pad_mask,
                                       layer_cache=layer_cache,
                                       attn_type="context")
        output = self.feed_forward(self.drop(mid) + query)

        return output, attns

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.context_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
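# The with_align branch above reduces per-head cross-attention to a single
# alignment matrix; the same reduction on a dummy tensor (shapes follow the
# docstring, values are random).
import torch

batch, heads, tgt_len, src_len = 2, 8, 5, 7
attns = torch.rand(batch, heads, tgt_len, src_len)  # cross-attention weights

alignment_heads = 2
if alignment_heads > 0:
    attns = attns[:, :alignment_heads, :, :].contiguous()

# Average the kept heads into one (B, tgt_len, src_len) alignment matrix.
attn_align = attns.mean(dim=1)
assert attn_align.shape == (batch, tgt_len, src_len)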
class TransformerEncoderLayer(nn.Module):
    """
    A single layer of the transformer encoder.

    Args:
        d_model (int): the dimension of keys/values/queries in
            MultiHeadedAttention, also the input size of
            the first layer of the PositionwiseFeedForward.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            PositionwiseFeedForward.
        dropout (float): dropout probability (0-1.0).
    """

    def __init__(self, d_model, heads, d_ff, dropout,
                 max_relative_positions=0):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
        self.self_attn2 = SelfMultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm2 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, mask):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, src_len, model_dim)``
            mask (LongTensor): ``(batch_size, src_len, src_len)``

        Returns:
            (FloatTensor):
            * outputs ``(batch_size, src_len, model_dim)``
        """
        input_norm = self.layer_norm(inputs)
        context, _ = self.self_attn(input_norm, input_norm, input_norm,
                                    mask=mask, type="self")
        out = self.dropout(context) + inputs
        return self.feed_forward(out)

    def update_dropout(self, dropout):
        self.self_attn.update_dropout(dropout)
        self.feed_forward.update_dropout(dropout)
        self.dropout.p = dropout
class TransformerDecoderLayer(nn.Module):
    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 self_attn_type="scaled-dot", max_relative_positions=0,
                 aan_useffn=False, full_context_alignment=False,
                 alignment_heads=0):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads, d_model, dropout=attention_dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads

    def forward(self, *args, **kwargs):
        with_align = kwargs.pop('with_align', False)
        output, attns = self._forward(*args, **kwargs)
        top_attn = attns[:, 0, :, :].contiguous()
        attn_align = None
        if with_align:
            if self.full_context_alignment:
                # return _, (B, Q_len, K_len)
                _, attns = self._forward(*args, **kwargs, future=True)
            if self.alignment_heads > 0:
                attns = attns[:, :self.alignment_heads, :, :].contiguous()
            attn_align = attns.mean(dim=1)
        return output, top_attn, attn_align

    def _forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
                 layer_cache=None, step=None, future=False):
        dec_mask = None
        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            if not future:
                future_mask = torch.ones(
                    [tgt_len, tgt_len],
                    device=tgt_pad_mask.device,
                    dtype=torch.uint8)
                future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
                try:
                    future_mask = future_mask.bool()
                except AttributeError:
                    pass
                dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
            else:
                dec_mask = tgt_pad_mask

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, _ = self.self_attn(input_norm, input_norm, input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      attn_type="self")
        elif isinstance(self.self_attn, AverageAttention):
            query, _ = self.self_attn(input_norm, mask=dec_mask,
                                      layer_cache=layer_cache, step=step)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, attns = self.context_attn(memory_bank, memory_bank, query_norm,
                                       mask=src_pad_mask,
                                       layer_cache=layer_cache,
                                       attn_type="context")
        output = self.feed_forward(self.drop(mid) + query)

        return output, attns

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.context_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
class TransformerDecoderLayer(nn.Module):
    """
    Args:
        d_model (int): the dimension of keys/values/queries in
            :class:`MultiHeadedAttention`, also the input size of
            the first layer of the :class:`PositionwiseFeedForward`.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            :class:`PositionwiseFeedForward`.
        dropout (float): dropout probability.
        self_attn_type (string): type of self-attention: scaled-dot, average
    """

    def __init__(self, d_model, heads, d_ff, dropout,
                 max_relative_positions=0):
        super(TransformerDecoderLayer, self).__init__()
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
        self.context_attn = MultiHeadedAttention(heads, d_model,
                                                 dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)

    def forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
                code_type="enc", layer_cache=None, step=None):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, 1, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, 1)``

        Returns:
            (FloatTensor):
            * output ``(batch_size, 1, model_dim)``
        """
        dec_mask = None
        if step is None:
            if code_type == 'dec':
                tgt_len = tgt_pad_mask.size(-1)
                future_mask = torch.ones([tgt_len, tgt_len],
                                         device=tgt_pad_mask.device,
                                         dtype=torch.uint8)
                future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
                dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
            else:
                dec_mask = tgt_pad_mask

        input_norm = self.layer_norm_1(inputs)
        query, _ = self.self_attn(input_norm, input_norm, input_norm,
                                  mask=dec_mask, layer_cache=layer_cache,
                                  type="self")
        query = self.drop(query) + inputs

        if code_type == 'dec':
            query_norm = self.layer_norm_2(query)
            mid, _ = self.context_attn(memory_bank, memory_bank, query_norm,
                                       mask=src_pad_mask,
                                       layer_cache=layer_cache,
                                       type="context")
            mid = self.drop(mid) + query
        else:
            mid = query
        output = self.feed_forward(mid)

        return output

    def update_dropout(self, dropout):
        self.self_attn.update_dropout(dropout)
        self.context_attn.update_dropout(dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
class TransformerEncoderLayer(nn.Module):
    """
    A single layer of the transformer encoder.

    Args:
        d_model (int): the dimension of keys/values/queries in
            MultiHeadedAttention, also the input size of
            the first layer of the PositionwiseFeedForward.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            PositionwiseFeedForward.
        dropout (float): dropout probability (0-1.0).
        activation (str): activation function to choose from
            ['relu', 'gelu']
        is_bert (bool): default False. When set True, layer_norm will be
            performed on the direct connection of residual block.
    """

    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 max_relative_positions=0, activation='relu', is_bert=False):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                    activation)
        self.layer_norm = nn.LayerNorm(d_model,
                                       eps=1e-12 if is_bert else 1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, mask):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, src_len, model_dim)``
            mask (LongTensor): ``(batch_size, 1, src_len)``

        Returns:
            (FloatTensor):
            * outputs ``(batch_size, src_len, model_dim)``
        """
        # Embedding -> [ LayerNorm -> Self Attention -> LayerNorm ->
        #   Position-wise FeedForward ]
        input_norm = self.layer_norm(inputs)
        context, _ = self.self_attn(input_norm, input_norm, input_norm,
                                    mask=mask, attn_type="self")
        # NOTE: the residual here is taken from the normalized input
        # (cf. the is_bert note in the docstring).
        out = self.dropout(context) + input_norm
        return self.feed_forward(out)

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.dropout.p = dropout
class TransformerDecoderLayer(nn.Module):
    """
    Args:
        d_model (int): the dimension of keys/values/queries in
            :class:`MultiHeadedAttention`, also the input size of
            the first layer of the :class:`PositionwiseFeedForward`.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            :class:`PositionwiseFeedForward`.
        dropout (float): dropout probability.
        self_attn_type (string): type of self-attention: scaled-dot, average
    """

    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 self_attn_type="scaled-dot", max_relative_positions=0,
                 aan_useffn=False, tgt_concept_words_type=-1):
        super(TransformerDecoderLayer, self).__init__()
        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads, d_model, dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)
        self.context_attn = MultiHeadedAttention(heads, d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.tgt_concept_words_type = tgt_concept_words_type
        if tgt_concept_words_type in [2]:
            self.tgt_concept_mlp = nn.Linear(d_model * 2, d_model)

    def forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
                layer_cache=None, step=None, tgt_concept_words_emb=None,
                tgt_concept_words_type=-1):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, 1, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, 1)``

        Returns:
            (FloatTensor, FloatTensor):
            * output ``(batch_size, 1, model_dim)``
            * attn ``(batch_size, 1, src_len)``
        """
        dec_mask = None
        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            future_mask = torch.ones([tgt_len, tgt_len],
                                     device=tgt_pad_mask.device,
                                     dtype=torch.uint8)
            future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
            # BoolTensor was introduced in pytorch 1.2
            try:
                future_mask = future_mask.bool()
            except AttributeError:
                pass
            dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, attn = self.self_attn(input_norm, input_norm, input_norm,
                                         mask=dec_mask,
                                         layer_cache=layer_cache,
                                         attn_type="self")
        elif isinstance(self.self_attn, AverageAttention):
            query, attn = self.self_attn(input_norm, mask=dec_mask,
                                         layer_cache=layer_cache, step=step)

        query = self.drop(query) + inputs

        # ablation: optionally mix in target concept-word embeddings
        if tgt_concept_words_emb is not None:
            if self.tgt_concept_words_type == 2:
                query = self.tgt_concept_mlp(
                    torch.cat([query, tgt_concept_words_emb], dim=2))
            if self.tgt_concept_words_type == 3:
                query = (query + tgt_concept_words_emb) / 2

        query_norm = self.layer_norm_2(query)
        mid, attn = self.context_attn(memory_bank, memory_bank, query_norm,
                                      mask=src_pad_mask,
                                      layer_cache=layer_cache,
                                      attn_type="context")
        output = self.feed_forward(self.drop(mid) + query)

        return output, attn

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.context_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
class TransformerEncoderLayer(nn.Module):
    """
    A single layer of the transformer encoder.

    Args:
        d_model (int): the dimension of keys/values/queries in
            MultiHeadedAttention, also the input size of
            the first layer of the PositionwiseFeedForward.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            PositionwiseFeedForward.
        dropout (float): dropout probability (0-1.0).
    """

    def __init__(self, d_model, heads, d_ff, dropout,
                 max_relative_positions=0, downsampling=1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.ds_layer = nn.Linear(d_model, int(
            d_model / downsampling)) if downsampling > 1 else None
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, mask):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, src_len, model_dim)``
            mask (LongTensor): ``(batch_size, src_len, src_len)``

        Returns:
            (FloatTensor):
            * outputs ``(batch_size, src_len, model_dim)``
        """
        b, l, d = inputs.size()
        input_norm = self.layer_norm(inputs)
        context, _ = self.self_attn(input_norm, input_norm, input_norm,
                                    mask=mask, attn_type="self")
        out = self.dropout(context) + inputs
        out = self.feed_forward(out)
        out = self.ds_layer(out).view(b, -1, d) \
            if self.ds_layer is not None else out
        return out

    def update_dropout(self, dropout):
        self.self_attn.update_dropout(dropout)
        self.feed_forward.update_dropout(dropout)
        self.dropout.p = dropout
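# The ds_layer reshape above trades hidden width for sequence length; the
# shape arithmetic in isolation (dummy sizes, with src_len divisible by the
# downsampling factor assumed).
import torch
import torch.nn as nn

batch, src_len, d_model, downsampling = 2, 6, 16, 2
ds_layer = nn.Linear(d_model, d_model // downsampling)

out = torch.randn(batch, src_len, d_model)
# (B, L, D) -> (B, L, D/k) -> viewed as (B, L/k, D): k consecutive
# positions are fused into one full-width vector.
out = ds_layer(out).view(batch, -1, d_model)
assert out.shape == (batch, src_len // downsampling, d_model)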
class TransformerDecoderLayer(nn.Module):
    """
    Args:
        d_model (int): the dimension of keys/values/queries in
            :class:`MultiHeadedAttention`, also the input size of
            the first layer of the :class:`PositionwiseFeedForward`.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            :class:`PositionwiseFeedForward`.
        dropout (float): dropout probability.
        self_attn_type (string): type of self-attention: scaled-dot, average
    """

    def __init__(self, opt, d_model, heads, d_ff, dropout, attention_dropout,
                 self_attn_type="scaled-dot", max_relative_positions=0,
                 aan_useffn=False, dict_size=None, label_emb=None):
        super(TransformerDecoderLayer, self).__init__()
        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads, d_model, dropout=dropout,
                max_relative_positions=max_relative_positions,
                dict_size=dict_size, label_emb=label_emb, opt=opt)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)
        self.context_attn = MultiHeadedAttention(heads, d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)

    def forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
                layer_cache=None, step=None, gold_par_attn=None,
                gold_ch_attn=None):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, 1, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, 1)``

        Returns:
            (FloatTensor, FloatTensor):
            * output ``(batch_size, 1, model_dim)``
            * attn ``(batch_size, 1, src_len)``
        """
        dec_mask = None
        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            future_mask = torch.ones([tgt_len, tgt_len],
                                     device=tgt_pad_mask.device,
                                     dtype=torch.uint8)
            future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
            # future_mask = future_mask.triu_(0).view(1, tgt_len, tgt_len)
            # future_mask[0, 0, 0] = 0
            # BoolTensor was introduced in pytorch 1.2
            try:
                future_mask = future_mask.bool()
            except AttributeError:
                pass
            dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
        # elif step != 0 and synsa:
        #     self_mask = torch.zeros(
        #         [1, 1, step + 1],
        #         device=tgt_pad_mask.device,
        #         dtype=torch.uint8)
        #     self_mask[:, :, -1] = 1
        #     try:
        #         self_mask = self_mask.bool()
        #     except AttributeError:
        #         pass
        #     dec_mask = torch.gt(self_mask, 0)

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, tgt_attn, second_attn, ch_labels, par_labels = \
                self.self_attn(input_norm, input_norm, input_norm,
                               mask=dec_mask, layer_cache=layer_cache,
                               attn_type="self",
                               gold_par_attn=gold_par_attn,
                               gold_ch_attn=gold_ch_attn)
        elif isinstance(self.self_attn, AverageAttention):
            query, attn = self.self_attn(input_norm, mask=dec_mask,
                                         layer_cache=layer_cache, step=step)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, src_attn, _, _, _ = self.context_attn(memory_bank, memory_bank,
                                                   query_norm,
                                                   mask=src_pad_mask,
                                                   layer_cache=layer_cache,
                                                   attn_type="context")
        output = self.feed_forward(self.drop(mid) + query)

        return (output, src_attn, tgt_attn, second_attn, dec_mask,
                ch_labels, par_labels)

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.context_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
class TransformerDecoderLayer(nn.Module):
    """
    Args:
        d_model (int): the dimension of keys/values/queries in
            :class:`MultiHeadedAttention`, also the input size of
            the first layer of the :class:`PositionwiseFeedForward`.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            :class:`PositionwiseFeedForward`.
        dropout (float): dropout probability.
        self_attn_type (string): type of self-attention: scaled-dot, average
    """

    def __init__(self, d_model, heads, d_ff, dropout,
                 self_attn_type="scaled-dot", max_relative_positions=0):
        super(TransformerDecoderLayer, self).__init__()
        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads, d_model, dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model, dropout=dropout)
        self.context_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)

    def forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
                layer_cache=None, step=None):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, 1, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, 1)``

        Returns:
            (FloatTensor, FloatTensor, FloatTensor):
            * output ``(batch_size, 1, model_dim)``
            * attn ``(batch_size, 1, src_len)``
            * context ``(batch_size, 1, model_dim)``
        """
        dec_mask = None
        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            future_mask = torch.ones(
                [tgt_len, tgt_len],
                device=tgt_pad_mask.device,
                dtype=torch.uint8)
            future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
            dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, attn = self.self_attn(input_norm, input_norm, input_norm,
                                         mask=dec_mask,
                                         layer_cache=layer_cache,
                                         attn_type="self")
        elif isinstance(self.self_attn, AverageAttention):
            query, attn = self.self_attn(input_norm, mask=dec_mask,
                                         layer_cache=layer_cache, step=step)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        context, attn = self.context_attn(memory_bank, memory_bank,
                                          query_norm, mask=src_pad_mask,
                                          layer_cache=layer_cache,
                                          attn_type="context")
        output = self.feed_forward(self.drop(context) + query)

        return output, attn, context

    def update_dropout(self, dropout):
        self.self_attn.update_dropout(dropout)
        self.context_attn.update_dropout(dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
def __init__(self, num_layers, size, heads, embeddings):
    super(FeedForwardEncoder, self).__init__()
    self.embeddings = embeddings
    self.size = size
    self.encode_layers = FeedForwardEncoderLayers(
        embeddings.embedding_size, size, num_layers)
    self.attention = MultiHeadedAttention(heads, embeddings.embedding_size)