Example #1
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1, version=1.0):
        super(ParallelEncoderLayer, self).__init__()
        self.version = version

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)
        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)
        self.multihead = MultiHeadAttention(h,
                                            d_model,
                                            attn_p=attn_p,
                                            static=onmt.Constants.static)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model, d_ff, ff_p)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        self.feedforward = Bottle(feedforward)
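For orientation, a minimal construction sketch for the layer above, assuming the surrounding package (onmt.Constants, PrePostProcessing, MultiHeadAttention, FeedForward/MaxOut, Bottle) is importable; the hyperparameter values are illustrative assumptions, not taken from the source.

    # Hypothetical usage; values are assumptions, not from the source.
    layer = ParallelEncoderLayer(h=8,            # attention heads
                                 d_model=512,    # model width
                                 p=0.1,          # residual dropout
                                 d_ff=2048,      # feed-forward inner size
                                 attn_p=0.1)     # attention dropout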
Example #2
    def __init__(
        self,
        h,
        d_model,
        p,
        d_ff,
        attn_p=0.1,
    ):
        super(LMDecoderLayer, self).__init__()

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)

        self.multihead_tgt = MultiHeadAttention(h,
                                                d_model,
                                                attn_p=attn_p,
                                                static=onmt.Constants.static,
                                                share=1)

        ff_p = p
        feedforward = FeedForward(d_model,
                                  d_ff,
                                  ff_p,
                                  static=onmt.Constants.static)
        self.feedforward = Bottle(feedforward)
Example #3
    def __init__(self,
                 h,
                 d_model,
                 p,
                 d_ff,
                 attn_p=0.1,
                 version=1.0,
                 ignore_source=False):
        super(RelativeTransformerDecoderLayer, self).__init__()
        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)
        self.ignore_source = ignore_source

        if not self.ignore_source:
            self.preprocess_src_attn = PrePostProcessing(d_model,
                                                         p,
                                                         sequence='n')
            self.postprocess_src_attn = PrePostProcessing(
                d_model, p, sequence='da', static=onmt.Constants.static)
            self.multihead_src = MultiHeadAttention(
                h,
                d_model,
                attn_p=attn_p,
                static=onmt.Constants.static,
                share=2)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)

        # self.multihead_tgt = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static, share=1)
        d_head = d_model // h
        self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h,
                                                              d_model,
                                                              d_head,
                                                              dropatt=attn_p)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      static=onmt.Constants.static)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        elif onmt.Constants.activation_layer == 'linear_swish_linear':
            ff_p = p
            feedforward = FeedForwardSwish(d_model,
                                           d_ff,
                                           ff_p,
                                           static=onmt.Constants.static)
        self.feedforward = feedforward
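The ignore_source flag decides whether the encoder-decoder attention sub-layer (preprocess_src_attn, postprocess_src_attn, multihead_src) is built at all; with ignore_source=True the layer attends only over the target side. A hedged construction sketch with assumed hyperparameters:

    # Hypothetical usage; hyperparameters are assumptions, not from the source.
    decoder_only = RelativeTransformerDecoderLayer(h=8, d_model=512, p=0.1,
                                                   d_ff=2048, attn_p=0.1,
                                                   ignore_source=True)
    # No source-attention modules were registered in this configuration.
    assert not hasattr(decoder_only, 'multihead_src')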
Example #4
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
        super(EncoderLayer, self).__init__()
        self.preprocess_rnn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_rnn = PrePostProcessing(d_model, p, sequence='da')

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da')

        self.rnn = nn.LSTM(d_model, d_model // 2, 1, bidirectional=True)

        #~ feedforward = FeedForward(d_model, d_ff, p)
        self.ffn = FeedForward(d_model, d_ff, p)
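A standalone shape check of the bidirectional LSTM above: the per-direction hidden size is d_model // 2 so that the concatenated forward and backward states come back out at d_model, matching the residual width. The d_model and tensor sizes below are assumed values.

    import torch
    import torch.nn as nn

    d_model = 512                                        # assumed width
    rnn = nn.LSTM(d_model, d_model // 2, 1, bidirectional=True)
    x = torch.randn(7, 3, d_model)                       # (seq_len, batch, d_model)
    out, _ = rnn(x)
    assert out.shape == (7, 3, d_model)                  # 2 directions * (d_model // 2)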
Example #5
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
        super(DecoderLayer, self).__init__()
        self.preprocess_rnn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_rnn = PrePostProcessing(d_model, p, sequence='da')

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da')

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da')

        self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p)
        self.rnn = nn.LSTM(d_model, d_model, 1, bidirectional=False)
        feedforward = FeedForward(d_model, d_ff, p)
        self.feedforward = feedforward
Example #6
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
        super(RelativeTransformerEncoderLayer, self).__init__()

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.preprocess_attn_rev = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)
        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)

        self.d_head = d_head = d_model // h
        self.multihead_fwd = RelPartialLearnableMultiHeadAttn(h // 2,
                                                              d_model,
                                                              d_head,
                                                              dropatt=attn_p)
        self.multihead_bwd = RelPartialLearnableMultiHeadAttn(h // 2,
                                                              d_model,
                                                              d_head,
                                                              dropatt=attn_p)
        self.attn_out = Linear(h * self.d_head, d_model)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      static=onmt.Constants.static)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        elif onmt.Constants.activation_layer == 'linear_swish_linear':
            ff_p = p
            feedforward = FeedForwardSwish(d_model,
                                           d_ff,
                                           ff_p,
                                           static=onmt.Constants.static)
        self.feedforward = feedforward
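This layer splits relative attention into a forward and a backward half: each RelPartialLearnableMultiHeadAttn gets h // 2 heads of width d_head = d_model // h, and attn_out projects h * d_head features back to d_model. A small arithmetic check with assumed values:

    # Assumed hyperparameters; only the constructor's arithmetic is checked.
    h, d_model = 8, 512
    d_head = d_model // h              # 64-dim heads
    heads_per_direction = h // 2       # 4 forward + 4 backward heads
    assert 2 * heads_per_direction * d_head == h * d_head == d_model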
Example #7
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1, variational=False):
        super(RelativeTransformerDecoderLayer, self).__init__()
        self.variational = variational
        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  variational=self.variational)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 variational=self.variational)

        d_head = d_model // h
        self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h,
                                                              d_model,
                                                              d_head,
                                                              dropatt=attn_p)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      variational=self.variational)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        elif onmt.Constants.activation_layer == 'linear_swish_linear':
            ff_p = p
            feedforward = FeedForwardSwish(d_model,
                                           d_ff,
                                           ff_p,
                                           variational=self.variational)
        else:
            raise NotImplementedError
        self.feedforward = Bottle(feedforward)
Example #8
    def __init__(self,
                 h,
                 d_model,
                 p,
                 d_ff,
                 pos_encoder,
                 time_encoder,
                 attn_p=0.1,
                 version=1.0):
        super(UniversalEncoderLayer, self).__init__()
        self.version = version
        # position and time embedding is added into the input before the layer
        self.pos_encoder = pos_encoder
        self.time_encoder = time_encoder

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)
        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)
        self.multihead = MultiHeadAttention(h,
                                            d_model,
                                            attn_p=attn_p,
                                            static=onmt.Constants.static)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model, d_ff, ff_p)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        self.feedforward = Bottle(feedforward)
Example #9
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
        super(FCTDecoderLayer, self).__init__()

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=True)

        self.preprocess_src_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(d_model,
                                                      p,
                                                      sequence='da',
                                                      static=True)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=True)

        #~ self.multihead_tgt = HierarchicalMultiHeadAttention(h, d_model, attn_p=attn_p)
        self.multihead_tgt = UniformMultiHeadAttention(h,
                                                       d_model,
                                                       attn_p=attn_p)
        #~ self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p)
        self.multihead_src = UniformMultiHeadAttention(h,
                                                       d_model,
                                                       attn_p=attn_p)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model, d_ff, ff_p)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        self.feedforward = Bottle(feedforward)
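Several of the layers above size their maxout alternative as k = int(math.ceil(d_ff / d_model)), so the number of maxout pieces tracks the feed-forward expansion factor. A quick check with assumed Transformer-base sizes:

    import math

    d_model, d_ff = 512, 2048             # assumed values
    k = int(math.ceil(d_ff / d_model))    # number of maxout pieces
    assert k == 4                         # MaxOut(d_model, d_model, k) gets k = 4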