def __init__(self, h, d_model, attn_p=0.1):
    """Multi-head attention with uniform heads over a d_model-wide state.

    Args:
        h: number of attention heads; must divide d_model evenly.
        d_model: model (hidden) dimension.
        attn_p: dropout probability applied to the attention weights.
    """
    super(UniformMultiHeadAttention, self).__init__()
    self.h = h
    self.d = d_model
    assert d_model % h == 0
    self.d_head = d_model // h

    # Bias-free query/key/value projections for the attention over states.
    # (Removed long-dead commented-out second-layer projections here.)
    self.fc_query = Bottle(Linear(d_model, h * self.d_head, bias=False))
    self.fc_key = Bottle(Linear(d_model, h * self.d_head, bias=False))
    self.fc_value = Bottle(Linear(d_model, h * self.d_head, bias=False))

    # Softmax over the last (key) dimension and the output projection.
    self.sm = nn.Softmax(dim=-1)
    self.fc_concat = Bottle(Linear(h * self.d_head, d_model, bias=False))

    # NOTE(review): StaticDropout presumably reuses one mask across time
    # steps — confirm against its definition before relying on that.
    self.attn_dropout = StaticDropout(attn_p)
def __init__(self, opt, death_rate=0.0, **kwargs):
    """Transformer encoder layer: self-attention + feed-forward sublayers.

    Args:
        opt: options namespace (model_size, dropout, n_heads, attn_dropout,
            inner_size, variational_dropout, fast_self_attention,
            fast_feed_forward).
        death_rate: stochastic-depth death rate stored for this layer.
    """
    super(EncoderLayer, self).__init__()
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.fast_self_attention = opt.fast_self_attention

    # Per-sublayer processing: 'n' = layer norm before, 'da' = dropout
    # (+ residual add, presumably — see PrePostProcessing) after.
    size, p = opt.model_size, opt.dropout
    self.preprocess_attn = PrePostProcessing(size, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(size, p, sequence='da', variational=self.variational)
    self.preprocess_ffn = PrePostProcessing(size, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(size, p, sequence='da', variational=self.variational)

    # Self-attention: "fast" implementation or the default one.
    if opt.fast_self_attention:
        self.multihead = SelfMultiheadAttn(size, opt.n_heads, opt.attn_dropout)
    else:
        self.multihead = MultiHeadAttention(opt.n_heads, size, attn_p=opt.attn_dropout, share=1)

    # Feed-forward: "fast" position-wise variant or Bottle-wrapped default.
    if opt.fast_feed_forward:
        self.feedforward = PositionWiseFeedForward(size, opt.inner_size, p,
                                                   variational=self.variational)
    else:
        self.feedforward = Bottle(FeedForward(size, opt.inner_size, p,
                                              variational=self.variational))
def __init__(self, h, d_model, p, d_ff, position_encoder, time_encoder, attn_p=0.1, version=1.0):
    """Universal (weight-shared) decoder layer with position/time encoders.

    Args:
        h: number of attention heads.
        d_model: model dimension.
        p: residual/feed-forward dropout probability.
        d_ff: feed-forward inner dimension.
        position_encoder: shared position-encoder module, stored as-is.
        time_encoder: shared time(step)-encoder module, stored as-is.
        attn_p: attention-weight dropout probability.
        version: layer version flag, stored as-is.

    Raises:
        NotImplementedError: if onmt.constants.activation_layer is not one
            of 'linear_relu_linear' or 'maxout'.
    """
    super(UniversalDecoderLayer, self).__init__()
    self.version = version
    self.position_encoder = position_encoder
    self.time_encoder = time_encoder

    # Layer norm before each sublayer; dropout (+ residual) after.
    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.constants.static)
    self.preprocess_src_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_src_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.constants.static)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=onmt.constants.static)

    # Target self-attention and source (encoder-decoder) attention.
    self.multihead_tgt = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.constants.static)
    self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.constants.static)

    if onmt.constants.activation_layer == 'linear_relu_linear':
        feedforward = FeedForward(d_model, d_ff, p, static=onmt.constants.static)
    elif onmt.constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    else:
        # Previously fell through and raised a confusing NameError on
        # `feedforward`; fail loudly instead (matches the sibling layers).
        raise NotImplementedError
    self.feedforward = Bottle(feedforward)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1, version=1.0, ignore_source=False,
             variational=False, death_rate=0.0):
    """Transformer-XL style decoder layer (relative-position self-attention).

    Args:
        h: number of attention heads.
        d_model: model dimension.
        p: residual/feed-forward dropout probability.
        d_ff: feed-forward inner dimension.
        attn_p: attention-weight dropout probability.
        version: layer version flag, stored as-is.
        ignore_source: flag stored for decoder-only operation.
        variational: use variational dropout in the post-processing steps.
        death_rate: stochastic-depth death rate stored for this layer.
    """
    super(TransformerXLDecoderLayer, self).__init__()
    self.version = version
    self.ignore_source = ignore_source
    self.variational = variational
    self.death_rate = death_rate

    # Layer norm before each sublayer; dropout (+ residual) after.
    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)

    # Relative-position self-attention over the target side.
    self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h, d_model, d_model // h, dropatt=attn_p)

    # Feed-forward sublayer, selected by the global activation setting.
    activation = onmt.constants.activation_layer
    if activation == 'linear_relu_linear':
        ff = FeedForward(d_model, d_ff, p, variational=self.variational)
    elif activation == 'linear_swish_linear':
        ff = FeedForwardSwish(d_model, d_ff, p)
    elif activation == 'maxout':
        ff = MaxOut(d_model, d_model, int(math.ceil(d_ff / d_model)))
    else:
        raise NotImplementedError
    self.feedforward = Bottle(ff)
def __init__(self, d_model, dropout_p, sequence='nda', variational=False,
             elementwise_affine=True, multilingual=False, n_languages=1):
    """Configurable pre/post sublayer processing.

    `sequence` is a string of one-letter steps applied in order:
    'n' = layer norm, 'd' = dropout ('a' carries no module here — presumably
    the residual add, handled in forward; confirm there).
    """
    super(PrePostProcessing, self).__init__()
    self.d_model = d_model
    self.dropout_p = dropout_p
    self.multilingual = multilingual
    self.steps = list(sequence)

    # Gated residual: a learnable scalar gate, initialised to one.
    if onmt.constants.residual_type == 'gated':
        self.k = nn.Parameter(torch.ones(1))

    if 'n' in self.steps:
        if multilingual:
            # One norm per language; not Bottle-wrapped.
            # NOTE(review): elementwise_affine is forced to True here,
            # ignoring the argument — confirm whether that is intentional.
            self.layer_norm = MultilingualLayerNorm(
                (self.d_model,), eps=1e-5, elementwise_affine=True,
                n_languages=n_languages)
        else:
            self.layer_norm = Bottle(
                LayerNorm((self.d_model,), elementwise_affine=elementwise_affine))

    if 'd' in self.steps:
        if variational:
            self.dropout = VariationalDropout(self.dropout_p, batch_first=False)
        else:
            self.dropout = nn.Dropout(self.dropout_p)
def __init__(self, h, d_model, attn_p=0.1, static=False, share=3):
    """Multi-head attention: h heads, each of width d_model // h.

    Args:
        h: number of heads; must divide d_model evenly.
        d_model: model dimension.
        attn_p: dropout probability on the attention weights.
        static: use StaticDropout instead of nn.Dropout for attention weights.
        share: sharing-mode flag, stored as-is for use elsewhere.
    """
    super(MultiHeadAttention, self).__init__()
    self.h = h
    self.d = d_model
    self.share = share
    assert d_model % h == 0
    self.d_head = d_model // h

    # Bias-free Q/K/V projections and the output (concat) projection.
    proj_dim = h * self.d_head
    self.fc_query = Bottle(Linear(d_model, proj_dim, bias=False))
    self.fc_key = Bottle(Linear(d_model, proj_dim, bias=False))
    self.fc_value = Bottle(Linear(d_model, proj_dim, bias=False))
    self.fc_concat = Bottle(Linear(proj_dim, d_model, bias=False))

    # Softmax over the last (key) dimension; dropout on attention weights.
    self.sm = nn.Softmax(dim=-1)
    self.attn_dropout = StaticDropout(attn_p) if static else nn.Dropout(attn_p)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1, variational=False,
             death_rate=0.0, max_len=64, **kwargs):
    """Encoder layer using learnable relative-position ("distance") attention.

    Args:
        h: number of attention heads.
        d_model: model dimension.
        p: residual/feed-forward dropout probability.
        d_ff: feed-forward inner dimension.
        attn_p: attention-weight dropout probability.
        variational: use variational dropout in post-processing.
        death_rate: stochastic-depth death rate stored for this layer.
        max_len: maximum relative distance for the learnable attention.
    """
    super(DistanceTransformerEncoderLayer, self).__init__()
    self.variational = variational
    self.death_rate = death_rate

    # Layer norm before each sublayer; dropout (+ residual) after.
    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)

    # Learnable relative-position attention, bounded by max_len.
    self.multihead = LearnableRelMultiHeadAttn(h, d_model, d_model // h,
                                               dropatt=attn_p, max_len=max_len)

    # Feed-forward sublayer, selected by the global activation setting.
    activation = onmt.constants.activation_layer
    if activation == 'linear_relu_linear':
        ff = FeedForward(d_model, d_ff, p, variational=self.variational)
    elif activation == 'linear_swish_linear':
        ff = FeedForwardSwish(d_model, d_ff, p, variational=self.variational)
    elif activation == 'maxout':
        ff = MaxOut(d_model, d_model, int(math.ceil(d_ff / d_model)))
    else:
        raise NotImplementedError
    self.feedforward = Bottle(ff)
def __init__(self, opt, death_rate=0.0):
    """Transformer decoder layer: self-attn, optional src-attn, feed-forward.

    With opt.macaron an extra feed-forward sublayer is built before
    self-attention and ffn_scale is set to 0.5 (used in forward, presumably
    to halve each FFN's contribution — confirm there).

    Args:
        opt: options namespace (model_size, dropout, n_heads, attn_dropout,
            inner_size, ignore_source, variational_dropout, macaron, rezero,
            fast_self_attention, fast_xattention, fast_feed_forward,
            ffn_activation, ffn_glu).
        death_rate: stochastic-depth death rate stored for this layer.
    """
    super(DecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.fast_self_attention = opt.fast_self_attention
    self.macaron = opt.macaron
    self.ffn_scale = 0.5 if self.macaron else 1

    size, p = opt.model_size, opt.dropout

    if self.macaron:
        # Extra ("macaron") feed-forward sublayer before self-attention.
        self.preprocess_mcr_ffn = preprocessing(opt.rezero, size, p, sequence='n')
        self.postprocess_mcr_ffn = PrePostProcessing(size, p, sequence='da',
                                                     variational=self.variational)
        self.mcr_feedforward = PositionWiseFeedForward(
            size, opt.inner_size, p, variational=self.variational,
            activation=opt.ffn_activation, glu=opt.ffn_glu)

    # Target self-attention sublayer.
    self.preprocess_attn = PrePostProcessing(size, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(size, p, sequence='da', variational=self.variational)
    if opt.fast_self_attention:
        self.multihead_tgt = SelfMultiheadAttn(size, opt.n_heads, opt.attn_dropout)
    else:
        self.multihead_tgt = MultiHeadAttention(opt.n_heads, size,
                                                attn_p=opt.attn_dropout, share=1)

    # Encoder-decoder attention (skipped for decoder-only models).
    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(size, p, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(size, p, sequence='da',
                                                      variational=self.variational)
        if opt.fast_xattention:
            self.multihead_src = EncdecMultiheadAttn(opt.n_heads, size, opt.attn_dropout)
        else:
            self.multihead_src = MultiHeadAttention(opt.n_heads, size,
                                                    attn_p=opt.attn_dropout, share=2)

    # Position-wise feed-forward sublayer.
    self.preprocess_ffn = PrePostProcessing(size, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(size, p, sequence='da', variational=self.variational)
    if opt.fast_feed_forward:
        self.feedforward = PositionWiseFeedForward(
            size, opt.inner_size, p, variational=self.variational,
            activation=opt.ffn_activation, glu=opt.ffn_glu)
    else:
        self.feedforward = Bottle(FeedForward(size, opt.inner_size, p,
                                              variational=self.variational))
def __init__(self, opt, death_rate=0.0, **kwargs):
    """Encoder layer with relative-position self-attention.

    Args:
        opt: options namespace (model_size, n_heads, dropout, attn_dropout,
            inner_size, variational_dropout, fast_self_attention,
            fast_feed_forward).
        death_rate: stochastic-depth death rate stored for this layer.
    """
    super(RelativeTransformerEncoderLayer, self).__init__()
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.fast_self_attention = opt.fast_self_attention

    # Layer norm before each sublayer; dropout (+ residual) after.
    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)
    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    # Relative-position self-attention: "fast" variant or the default.
    d_head = opt.model_size // opt.n_heads
    if not self.fast_self_attention:
        self.multihead = RelPartialLearnableMultiHeadAttn(
            opt.n_heads, opt.model_size, d_head, dropatt=opt.attn_dropout)
    else:
        self.multihead = RelativeSelfMultiheadAttn(opt.model_size, opt.n_heads,
                                                   opt.attn_dropout)

    # (Removed a leftover debug print of opt.fast_feed_forward here.)
    if not opt.fast_feed_forward:
        feedforward = FeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                  variational=self.variational)
        self.feedforward = Bottle(feedforward)
    else:
        self.feedforward = PositionWiseFeedForward(
            opt.model_size, opt.inner_size, opt.dropout, variational=self.variational)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
    """FCT decoder layer: uniform multi-head attention for target and source.

    Args:
        h: number of attention heads.
        d_model: model dimension.
        p: residual/feed-forward dropout probability.
        d_ff: feed-forward inner dimension.
        attn_p: attention-weight dropout probability.

    Raises:
        NotImplementedError: if onmt.constants.activation_layer is not one
            of 'linear_relu_linear' or 'maxout'.
    """
    super(FCTDecoderLayer, self).__init__()

    # Layer norm before each sublayer; static dropout (+ residual) after.
    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=True)
    self.preprocess_src_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_src_attn = PrePostProcessing(d_model, p, sequence='da', static=True)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=True)

    # Uniform multi-head attention for both target self-attention and
    # source (encoder-decoder) attention. (Removed dead commented-out
    # Hierarchical/plain MultiHeadAttention alternatives here.)
    self.multihead_tgt = UniformMultiHeadAttention(h, d_model, attn_p=attn_p)
    self.multihead_src = UniformMultiHeadAttention(h, d_model, attn_p=attn_p)

    if onmt.constants.activation_layer == 'linear_relu_linear':
        feedforward = FeedForward(d_model, d_ff, p)
    elif onmt.constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    else:
        # Previously fell through and raised a confusing NameError on
        # `feedforward`; fail loudly instead (matches the sibling layers).
        raise NotImplementedError
    self.feedforward = Bottle(feedforward)
def __init__(self, opt, death_rate=0.0):
    """Decoder layer with relative-position self-attention.

    Builds target self-attention, optional encoder-decoder attention
    (skipped when opt.ignore_source) and a feed-forward sublayer, each
    surrounded by layer-norm pre-processing and dropout post-processing.

    Args:
        opt: options namespace (model_size, dropout, n_heads, attn_dropout,
            inner_size, ignore_source, variational_dropout,
            fast_self_attention, fast_xattention, fast_feed_forward).
        death_rate: stochastic-depth death rate stored for this layer.
    """
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.fast_self_attention = opt.fast_self_attention

    size, p = opt.model_size, opt.dropout
    heads, attn_p = opt.n_heads, opt.attn_dropout

    # Target self-attention pre/post processing.
    self.preprocess_attn = PrePostProcessing(size, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(size, p, sequence='da', variational=self.variational)

    # Encoder-decoder attention (omitted for decoder-only use).
    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(size, p, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(size, p, sequence='da',
                                                      variational=self.variational)
        if opt.fast_xattention:
            self.multihead_src = EncdecMultiheadAttn(heads, size, attn_p)
        else:
            self.multihead_src = MultiHeadAttention(heads, size, attn_p=attn_p, share=2)

    # Feed-forward pre/post processing.
    self.preprocess_ffn = PrePostProcessing(size, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(size, p, sequence='da', variational=self.variational)

    # Relative-position self-attention over the target sequence.
    if self.fast_self_attention:
        self.multihead_tgt = RelativeSelfMultiheadAttn(size, heads, attn_p)
    else:
        self.multihead_tgt = RelPartialLearnableMultiHeadAttn(heads, size, size // heads,
                                                              dropatt=attn_p)

    if opt.fast_feed_forward:
        self.feedforward = PositionWiseFeedForward(size, opt.inner_size, p,
                                                   variational=self.variational)
    else:
        self.feedforward = Bottle(FeedForward(size, opt.inner_size, p,
                                              variational=self.variational))