def __init__(self, h, d_model, attn_p=0.1):
    super(UniformMultiHeadAttention, self).__init__()
    self.h = h
    self.d = d_model

    assert d_model % h == 0
    self.d_head = d_model // h

    # first attention layer for states
    self.fc_query = Bottle(Linear(d_model, h * self.d_head, bias=False))
    self.fc_key = Bottle(Linear(d_model, h * self.d_head, bias=False))
    self.fc_value = Bottle(Linear(d_model, h * self.d_head, bias=False))

    # second attention for layers
    #~ self.fc_query_2 = Bottle(Linear(d_model, h * self.d_head, bias=False))
    #~ self.fc_key_2 = Bottle(Linear(d_model, h * self.d_head, bias=False))
    #~ self.fc_value_2 = Bottle(Linear(d_model, h * self.d_head, bias=False))

    # for output
    self.sm = nn.Softmax(dim=-1)
    self.fc_concat = Bottle(Linear(h * self.d_head, d_model, bias=False))
    #~ self.fc_concat_2 = Bottle(Linear(d_model, d_model, bias=False))

    #~ self.attn_dropout = nn.Dropout(attn_p)
    self.attn_dropout = StaticDropout(attn_p)
def __init__(self, h, d_model, attn_p=0.1, static=True, share=3):
    super(MultiHeadAttention, self).__init__()
    self.h = h
    self.d = d_model
    self.share = share

    assert d_model % h == 0
    self.d_head = d_model // h  # D.S: d_head = d_v = d_k

    # D.S: fc_query is a fully connected layer producing the linear combination
    # W_q * x_i = q_i for a given word embedding x_i
    self.fc_query = Bottle(Linear(d_model, h * self.d_head, bias=False))  # D.S: Bottle (mask for skipping unnecessary computations)
    self.fc_key = Bottle(Linear(d_model, h * self.d_head, bias=False))  # D.S: params: Linear(d_in, d_out, bias=True, nonlinearity='linear')
    self.fc_value = Bottle(Linear(d_model, h * self.d_head, bias=False))

    self.attention_out = onmt.Constants.attention_out  # TODO: constant not existing??

    # D.S: concatenate the outputs of all heads into a vector of size d_model,
    # which is the output of the encoder/decoder sub-layer
    self.fc_concat = Bottle(Linear(h * self.d_head, d_model, bias=False))

    self.sm = nn.Softmax(dim=-1)  # D.S: apply softmax over the last dimension

    if static:
        self.attn_dropout = StaticDropout(attn_p)
    else:
        self.attn_dropout = nn.Dropout(attn_p)
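# The forward pass that uses these projections is not shown here. As a rough,
# hedged sketch (not this repository's actual forward code), the h projected
# heads typically feed into scaled dot-product attention roughly as below; the
# function name and the [batch * h, len, d_head] tensor layout are illustrative
# assumptions only.
import torch
import torch.nn.functional as F


def scaled_dot_product_attention_sketch(q, k, v, mask=None, dropout=None):
    """q, k, v: [batch * h, len, d_head] (assumed layout). Returns attended values."""
    d_head = q.size(-1)
    # attention scores, scaled by sqrt(d_head) as in Vaswani et al. (2017)
    scores = torch.matmul(q, k.transpose(-2, -1)) / (d_head ** 0.5)
    if mask is not None:
        scores = scores.masked_fill(mask, float('-inf'))
    attn = F.softmax(scores, dim=-1)   # corresponds to self.sm above
    if dropout is not None:
        attn = dropout(attn)           # corresponds to self.attn_dropout above
    return torch.matmul(attn, v)       # concatenated heads then go through self.fc_concat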
def __init__(self, h, d_model, attn_p=0.1, static=True, share=3, limit_rhs_steps=None):
    super(MultiHeadAttention, self).__init__()
    self.h = h
    self.d = d_model
    self.share = share

    assert d_model % h == 0
    self.d_head = d_model // h

    self.fc_query = Bottle(Linear(d_model, h * self.d_head, bias=False))
    self.fc_key = Bottle(Linear(d_model, h * self.d_head, bias=False))
    self.fc_value = Bottle(Linear(d_model, h * self.d_head, bias=False))
    self.fc_concat = Bottle(Linear(h * self.d_head, d_model, bias=False))

    self.sm = nn.Softmax(dim=-1)

    if static:
        self.attn_dropout = StaticDropout(attn_p)
    else:
        self.attn_dropout = nn.Dropout(attn_p)

    self.limit_rhs_steps = limit_rhs_steps
def __init__(self, d_model, dropout_p, sequence='nda', variational=False, elementwise_affine=True):
    super(PrePostProcessing, self).__init__()
    self.d_model = d_model
    self.dropout_p = dropout_p

    self.steps = list(sequence)

    if onmt.Constants.residual_type == 'gated':
        # gated residual: initialize the gate scale k with one
        self.k = nn.Parameter(torch.ones(1))

    if 'n' in self.steps:
        ln = nn.LayerNorm((self.d_model,), elementwise_affine=elementwise_affine)
        self.layer_norm = Bottle(ln)
    if 'd' in self.steps:
        if variational:
            self.dropout = VariationalDropout(self.dropout_p, batch_first=False)
        else:
            self.dropout = nn.Dropout(self.dropout_p)
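# A hedged sketch of how the 'n' / 'd' / 'a' steps of a pre/post-processing
# wrapper like this one are conventionally applied: 'n' = layer norm,
# 'd' = dropout, 'a' = residual add. The function below is an illustrative
# stand-in, not the class's actual forward method.
def pre_post_process_sketch(x, residual=None, steps='nda', layer_norm=None, dropout=None):
    """Apply the requested processing steps in order (illustrative assumption)."""
    for step in steps:
        if step == 'n' and layer_norm is not None:
            x = layer_norm(x)          # normalize before the sub-layer
        elif step == 'd' and dropout is not None:
            x = dropout(x)             # regularize the sub-layer output
        elif step == 'a' and residual is not None:
            x = x + residual           # residual connection back to the input
    return x

# Example usage (hypothetical): postprocess with sequence='da', i.e. dropout then residual add:
#   out = pre_post_process_sketch(sub_layer_out, residual=x, steps='da', dropout=nn.Dropout(0.1))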
def __init__(self, h, d_model, p, d_ff, attn_p=0.1, version=1.0):
    super(EncoderLayer, self).__init__()
    self.version = version

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.multihead = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p, static=onmt.Constants.static)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    self.feedforward = Bottle(feedforward)
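# A hedged sketch of the pre-norm encoder-layer data flow that these modules
# imply (normalize -> self-attention -> dropout + residual, then the same around
# the feed-forward block). This is an illustrative outline only; the sub-module
# call signatures are assumptions, not the layer's real forward().
def encoder_layer_forward_sketch(layer, x, attn_mask=None):
    # self-attention sub-layer
    normed = layer.preprocess_attn(x)
    attn_out = layer.multihead(normed, normed, normed, attn_mask)  # signature assumed
    x = layer.postprocess_attn(attn_out, x)                        # dropout + residual add (assumed)

    # position-wise feed-forward sub-layer
    normed = layer.preprocess_ffn(x)
    ff_out = layer.feedforward(normed)
    x = layer.postprocess_ffn(ff_out, x)
    return x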
def __init__(self, d_model, dropout_p, sequence='nda', static=True, elementwise_affine=True):
    super(PrePostProcessing, self).__init__()
    self.d_model = d_model
    self.dropout_p = dropout_p

    self.steps = list(sequence)

    if onmt.Constants.residual_type == 'gated':
        # gated residual: initialize the gate scale k with one
        self.k = nn.Parameter(torch.ones(1))

    if 'n' in self.steps:
        ln = nn.LayerNorm((self.d_model,), elementwise_affine=elementwise_affine)
        #~ ln.weight.data.fill_(1)
        self.layer_norm = Bottle(ln)
    if 'd' in self.steps:
        if static:
            self.dropout = StaticDropout(self.dropout_p)
        else:
            self.dropout = nn.Dropout(self.dropout_p, inplace=False)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1, version=1.0, ignore_source=False):
    super(DecoderLayer, self).__init__()
    self.version = version
    self.ignore_source = ignore_source

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)

    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
        self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static, share=2)

    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.multihead_tgt = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static, share=1)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p, static=onmt.Constants.static)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    elif onmt.Constants.activation_layer == 'linear_swish_linear':
        ff_p = p
        feedforward = FeedForwardSwish(d_model, d_ff, ff_p, static=onmt.Constants.static)
    self.feedforward = Bottle(feedforward)
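# A hedged outline of the decoder-layer flow implied above: masked self-attention,
# then (unless ignore_source is set) attention over the encoder context, then the
# feed-forward block, each wrapped in pre-norm and dropout/residual processing.
# Sub-module call signatures are assumed for illustration; this is not the real forward().
def decoder_layer_forward_sketch(layer, x, context=None, tgt_mask=None, src_mask=None):
    # masked self-attention over previous target positions
    normed = layer.preprocess_attn(x)
    self_attn_out = layer.multihead_tgt(normed, normed, normed, tgt_mask)
    x = layer.postprocess_attn(self_attn_out, x)

    # encoder-decoder attention, skipped when the source side is ignored
    if not layer.ignore_source and context is not None:
        normed = layer.preprocess_src_attn(x)
        src_attn_out = layer.multihead_src(normed, context, context, src_mask)
        x = layer.postprocess_src_attn(src_attn_out, x)

    # position-wise feed-forward
    normed = layer.preprocess_ffn(x)
    x = layer.postprocess_ffn(layer.feedforward(normed), x)
    return x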
def __init__(self, h, d_model, attn_p=0.1, static=True):
    super(MultiHeadAttention, self).__init__()
    self.h = h
    self.d = d_model

    assert d_model % h == 0
    self.d_head = d_model // h

    self.fc_query = Bottle(Linear(d_model, h * self.d_head, bias=False))
    self.fc_key = Bottle(Linear(d_model, h * self.d_head, bias=False))
    self.fc_value = Bottle(Linear(d_model, h * self.d_head, bias=False))

    self.attention_out = onmt.Constants.attention_out
    self.fc_concat = Bottle(Linear(h * self.d_head, d_model, bias=False))

    self.sm = nn.Softmax(dim=-1)

    if static:
        self.attn_dropout = StaticDropout(attn_p)
    else:
        self.attn_dropout = nn.Dropout(attn_p)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1, residual_p=0.1, version=1.0):
    super(DecoderLayer, self).__init__()
    self.version = version

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, residual_p, sequence='da', static=onmt.Constants.static)
    self.preprocess_src_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_src_attn = PrePostProcessing(d_model, residual_p, sequence='da', static=onmt.Constants.static)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, residual_p, sequence='da', static=onmt.Constants.static)

    # D.S: share controls weight sharing between query, key and value projections
    self.multihead_tgt = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static, share=1)
    self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static, share=2)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p, static=onmt.Constants.static)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    self.feedforward = Bottle(feedforward)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1, variational=False, **kwargs):
    super(EncoderLayer, self).__init__()
    self.variational = variational

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)
    self.multihead = MultiHeadAttention(h, d_model, attn_p=attn_p, share=2)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p, variational=self.variational)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    elif onmt.Constants.activation_layer == 'linear_swish_linear':
        ff_p = p
        feedforward = FeedForwardSwish(d_model, d_ff, ff_p, variational=self.variational)
    else:
        raise NotImplementedError
    self.feedforward = Bottle(feedforward)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1, variational=False):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.variational = variational

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)

    d_head = d_model // h
    self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h, d_model, d_head, dropatt=attn_p)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p, variational=self.variational)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    elif onmt.Constants.activation_layer == 'linear_swish_linear':
        ff_p = p
        feedforward = FeedForwardSwish(d_model, d_ff, ff_p, variational=self.variational)
    else:
        raise NotImplementedError
    self.feedforward = Bottle(feedforward)
def __init__(self, h, d_model, p, d_ff, pos_encoder, time_encoder, attn_p=0.1, version=1.0):
    super(UniversalEncoderLayer, self).__init__()
    self.version = version

    # position and time embeddings are added to the input before the layer
    self.pos_encoder = pos_encoder
    self.time_encoder = time_encoder

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.multihead = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    self.feedforward = Bottle(feedforward)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
    super(FCTDecoderLayer, self).__init__()

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=True)
    self.preprocess_src_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_src_attn = PrePostProcessing(d_model, p, sequence='da', static=True)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=True)

    #~ self.multihead_tgt = HierarchicalMultiHeadAttention(h, d_model, attn_p=attn_p)
    self.multihead_tgt = UniformMultiHeadAttention(h, d_model, attn_p=attn_p)
    #~ self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p)
    self.multihead_src = UniformMultiHeadAttention(h, d_model, attn_p=attn_p)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    self.feedforward = Bottle(feedforward)