def __init__(self, h, d_model, p, d_ff, attn_p=0.1, version=1.0):
    super(ParallelEncoderLayer, self).__init__()
    self.version = version

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)

    self.multihead = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    self.feedforward = Bottle(feedforward)
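# Illustrative note (added, not in the original source): the MaxOut pool size k is tied
# to the FFN expansion ratio. Assuming the common setting d_model=512, d_ff=2048:
#
#     import math
#     k = int(math.ceil(2048 / 512))   # k = 4
#
# so the 'maxout' branch builds MaxOut(512, 512, 4) in place of the two-layer ReLU FFN.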
def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
    super(LMDecoderLayer, self).__init__()

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)

    self.multihead_tgt = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static, share=1)

    ff_p = p
    feedforward = FeedForward(d_model, d_ff, ff_p, static=onmt.Constants.static)
    self.feedforward = Bottle(feedforward)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1, version=1.0, ignore_source=False):
    super(RelativeTransformerDecoderLayer, self).__init__()

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.ignore_source = ignore_source

    # the encoder-decoder attention block is only built when the source side is used
    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
        self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static, share=2)

    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)

    # self.multihead_tgt = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static, share=1)
    d_head = d_model // h
    self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h, d_model, d_head, dropatt=attn_p)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p, static=onmt.Constants.static)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    elif onmt.Constants.activation_layer == 'linear_swish_linear':
        ff_p = p
        feedforward = FeedForwardSwish(d_model, d_ff, ff_p, static=onmt.Constants.static)
    self.feedforward = feedforward
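# Usage sketch (added; hyper-parameter values are illustrative assumptions, not taken
# from the original source). A decoder-only variant that skips source attention:
#
#     layer = RelativeTransformerDecoderLayer(h=8, d_model=512, p=0.1, d_ff=2048,
#                                             attn_p=0.1, ignore_source=True)
#
# onmt.Constants.static and onmt.Constants.activation_layer must already be set
# (normally by the option parser) before this constructor is called.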
def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
    super(EncoderLayer, self).__init__()

    self.preprocess_rnn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_rnn = PrePostProcessing(d_model, p, sequence='da')
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da')

    self.rnn = nn.LSTM(d_model, d_model // 2, 1, bidirectional=True)

    # ~ feedforward = FeedForward(d_model, d_ff, p)
    self.ffn = FeedForward(d_model, d_ff, p)
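# Note (added, assuming d_model is even): the bidirectional LSTM uses d_model // 2 hidden
# units per direction, so the concatenation of both directions is again d_model-dimensional
# and can be added residually by postprocess_rnn, e.g. d_model=512 -> 256 + 256 = 512.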
def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
    super(DecoderLayer, self).__init__()

    self.preprocess_rnn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_rnn = PrePostProcessing(d_model, p, sequence='da')
    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da')
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da')

    # attention over the encoder output
    self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p)

    # unidirectional LSTM over the target side
    self.rnn = nn.LSTM(d_model, d_model, 1, bidirectional=False)

    feedforward = FeedForward(d_model, d_ff, p)
    self.feedforward = feedforward
def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
    super(RelativeTransformerEncoderLayer, self).__init__()

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.preprocess_attn_rev = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)

    self.d_head = d_head = d_model // h
    self.multihead_fwd = RelPartialLearnableMultiHeadAttn(h // 2, d_model, d_head, dropatt=attn_p)
    self.multihead_bwd = RelPartialLearnableMultiHeadAttn(h // 2, d_model, d_head, dropatt=attn_p)
    self.attn_out = Linear(h * self.d_head, d_model)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p, static=onmt.Constants.static)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    elif onmt.Constants.activation_layer == 'linear_swish_linear':
        ff_p = p
        feedforward = FeedForwardSwish(d_model, d_ff, ff_p, static=onmt.Constants.static)
    self.feedforward = feedforward
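# Note (added; this reading is inferred from the attribute names and is not stated in the
# original source): the h attention heads appear to be split between a forward and a
# backward ("rev") relative-attention module with h // 2 heads each, and attn_out then maps
# the h * d_head concatenated head outputs (= d_model when d_model % h == 0) back to d_model.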
def __init__(self, h, d_model, p, d_ff, attn_p=0.1, variational=False):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.variational = variational

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)

    d_head = d_model // h
    self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h, d_model, d_head, dropatt=attn_p)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p, variational=self.variational)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    elif onmt.Constants.activation_layer == 'linear_swish_linear':
        ff_p = p
        feedforward = FeedForwardSwish(d_model, d_ff, ff_p, variational=self.variational)
    else:
        raise NotImplementedError
    self.feedforward = Bottle(feedforward)
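# Usage sketch (added; the values below are illustrative assumptions, not from the original
# source):
#
#     layer = RelativeTransformerDecoderLayer(h=8, d_model=512, p=0.1, d_ff=2048,
#                                             attn_p=0.1, variational=True)
#
# With variational=True the PrePostProcessing and FeedForward modules receive the
# variational flag, which presumably selects variational dropout (one mask reused across
# positions) rather than standard dropout.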
def __init__(self, h, d_model, p, d_ff, pos_encoder, time_encoder, attn_p=0.1, version=1.0):
    super(UniversalEncoderLayer, self).__init__()
    self.version = version

    # position and time embeddings are added to the input before this layer
    self.pos_encoder = pos_encoder
    self.time_encoder = time_encoder

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)

    self.multihead = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    self.feedforward = Bottle(feedforward)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
    super(FCTDecoderLayer, self).__init__()

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=True)
    self.preprocess_src_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_src_attn = PrePostProcessing(d_model, p, sequence='da', static=True)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=True)

    # ~ self.multihead_tgt = HierarchicalMultiHeadAttention(h, d_model, attn_p=attn_p)
    self.multihead_tgt = UniformMultiHeadAttention(h, d_model, attn_p=attn_p)
    # ~ self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p)
    self.multihead_src = UniformMultiHeadAttention(h, d_model, attn_p=attn_p)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    self.feedforward = Bottle(feedforward)