def __init__(self, opt, death_rate=0.0, **kwargs):
    super(EncoderLayer, self).__init__()
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.fast_self_attention = opt.fast_self_attention

    # pre-norm ('n') before each sub-layer, dropout + residual add ('da') after it
    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)
    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    if opt.fast_self_attention:
        self.multihead = SelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
    else:
        self.multihead = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=1)

    if not opt.fast_feed_forward:
        feedforward = FeedForward(opt.model_size, opt.inner_size, opt.dropout, variational=self.variational)
        self.feedforward = Bottle(feedforward)
    else:
        self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                   variational=self.variational)
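# --- Usage sketch (illustrative, not part of the original source) ---------------
# EncoderLayer only reads the option fields used in the constructor above, so a bare
# namespace is enough to instantiate it in isolation; `example_opt` and its values
# are assumptions chosen for illustration.
from types import SimpleNamespace

example_opt = SimpleNamespace(
    model_size=512,             # d_model
    inner_size=2048,            # feed-forward inner dimension
    n_heads=8,
    dropout=0.1,
    attn_dropout=0.1,
    variational_dropout=False,
    fast_self_attention=False,  # take the plain MultiHeadAttention path
    fast_feed_forward=False,    # take the Bottle(FeedForward(...)) path
)
encoder_layer = EncoderLayer(example_opt, death_rate=0.0)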
def __init__(self, h, d_model, p, d_ff, attn_p=0.1, variational=False, death_rate=0.0, max_len=64, **kwargs):
    super(DistanceTransformerEncoderLayer, self).__init__()
    self.variational = variational
    self.death_rate = death_rate

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)

    # self.multihead = MultiHeadAttention(h, d_model, attn_p=attn_p, share=2)
    d_head = d_model // h
    self.multihead = LearnableRelMultiHeadAttn(h, d_model, d_head, dropatt=attn_p, max_len=max_len)

    if onmt.constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p, variational=self.variational)
    elif onmt.constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    elif onmt.constants.activation_layer == 'linear_swish_linear':
        ff_p = p
        feedforward = FeedForwardSwish(d_model, d_ff, ff_p, variational=self.variational)
    else:
        raise NotImplementedError
    self.feedforward = Bottle(feedforward)
def __init__(self, opt, death_rate=0.0):
    super(DecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.fast_self_attention = opt.fast_self_attention
    self.macaron = opt.macaron
    # scale applied to the feed-forward outputs when macaron mode is on
    self.ffn_scale = 0.5 if self.macaron else 1

    if self.macaron:
        # extra (macaron) feed-forward sub-layer in addition to the standard one below
        self.preprocess_mcr_ffn = preprocessing(opt.rezero, opt.model_size, opt.dropout, sequence='n')
        self.postprocess_mcr_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                     variational=self.variational)
        self.mcr_feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                       variational=self.variational,
                                                       activation=opt.ffn_activation, glu=opt.ffn_glu)

    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)

    if opt.fast_self_attention:
        self.multihead_tgt = SelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
    else:
        self.multihead_tgt = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=1)

    if not self.ignore_source:
        # cross-attention over the encoder output, omitted when the source is ignored
        self.preprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                      variational=self.variational)
        if not opt.fast_xattention:
            self.multihead_src = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=2)
        else:
            self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)

    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    if not opt.fast_feed_forward:
        feedforward = FeedForward(opt.model_size, opt.inner_size, opt.dropout, variational=self.variational)
        self.feedforward = Bottle(feedforward)
    else:
        self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                   variational=self.variational,
                                                   activation=opt.ffn_activation, glu=opt.ffn_glu)
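# --- Usage sketch (illustrative, not part of the original source) ---------------
# The decoder layer reads a few more flags than the encoder; with macaron and the
# fast/fused paths disabled, the optional fields (rezero, ffn_activation, ffn_glu)
# are never touched, so this minimal namespace suffices. `example_dec_opt` and its
# values are assumptions for the sketch.
from types import SimpleNamespace

example_dec_opt = SimpleNamespace(
    model_size=512,
    inner_size=2048,
    n_heads=8,
    dropout=0.1,
    attn_dropout=0.1,
    variational_dropout=False,
    ignore_source=False,        # keep encoder-decoder attention
    macaron=False,              # no extra macaron feed-forward block
    fast_self_attention=False,
    fast_xattention=False,
    fast_feed_forward=False,
)
decoder_layer = DecoderLayer(example_dec_opt, death_rate=0.0)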
def __init__(self, opt, death_rate=0.0, **kwargs):
    super(RelativeTransformerEncoderLayer, self).__init__()
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.fast_self_attention = opt.fast_self_attention

    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)
    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    d_head = opt.model_size // opt.n_heads
    if not self.fast_self_attention:
        self.multihead = RelPartialLearnableMultiHeadAttn(opt.n_heads, opt.model_size, d_head,
                                                          dropatt=opt.attn_dropout)
    else:
        self.multihead = RelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)

    if not opt.fast_feed_forward:
        feedforward = FeedForward(opt.model_size, opt.inner_size, opt.dropout, variational=self.variational)
        self.feedforward = Bottle(feedforward)
    else:
        self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                   variational=self.variational)
def __init__(self, opt, death_rate=0.0):
    super(ReformerEncoderLayer, self).__init__()
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    d_model = opt.model_size
    p = opt.dropout

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)

    # LSH-based self-attention (Reformer) in place of full softmax attention
    self.self_attention = LSHSelfAttention(opt)
    self.feedforward = FeedForward(opt.model_size, opt.inner_size, opt.dropout, opt.variational_dropout)
def __init__(self, opt, death_rate=0.0):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.fast_self_attention = opt.fast_self_attention
    # self.lfv_multilingual = opt.lfv_multilingual

    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)

    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                      variational=self.variational)
        if opt.fast_xattention:
            self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)
        else:
            self.multihead_src = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=2)

    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    d_head = opt.model_size // opt.n_heads
    if not self.fast_self_attention:
        self.multihead_tgt = RelPartialLearnableMultiHeadAttn(opt.n_heads, opt.model_size, d_head,
                                                              dropatt=opt.attn_dropout)
    else:
        self.multihead_tgt = RelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)

    if not opt.fast_feed_forward:
        feedforward = FeedForward(opt.model_size, opt.inner_size, opt.dropout, variational=self.variational)
        self.feedforward = Bottle(feedforward)
    else:
        self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                   variational=self.variational)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1, version=1.0, ignore_source=False,
             variational=False, death_rate=0.0):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.version = version
    self.ignore_source = ignore_source
    self.variational = variational
    self.death_rate = death_rate

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)

    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)
        self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p, share=2)

    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)

    d_head = d_model // h
    self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h, d_model, d_head, dropatt=attn_p)
    # self.multihead_tgt = MultiHeadAttention(h, d_model, attn_p=attn_p, share=1)

    if onmt.constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p, variational=self.variational)
    elif onmt.constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    elif onmt.constants.activation_layer == 'linear_swish_linear':
        ff_p = p
        feedforward = FeedForwardSwish(d_model, d_ff, ff_p)
    else:
        raise NotImplementedError
    self.feedforward = Bottle(feedforward)
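# --- Usage sketch (illustrative, not part of the original source) ---------------
# This variant takes explicit hyper-parameters instead of an `opt` object, but it
# still consults the global onmt.constants.activation_layer to choose the
# feed-forward type, so that constant must be set before the layer is built.
# The dimensions below are common "base" transformer settings, shown as an example.
relative_decoder_layer = RelativeTransformerDecoderLayer(
    h=8,            # attention heads
    d_model=512,    # model dimension
    p=0.1,          # residual/feed-forward dropout
    d_ff=2048,      # feed-forward inner dimension
    attn_p=0.1,     # attention dropout
    ignore_source=False,
    variational=False,
    death_rate=0.0,
)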