Example #1
    def __init__(self, opt, death_rate=0.0, **kwargs):
        super(EncoderLayer, self).__init__()
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.fast_self_attention = opt.fast_self_attention

        self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                  variational=self.variational)
        self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                 variational=self.variational)

        if opt.fast_self_attention:
            self.multihead = SelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
        else:
            self.multihead = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=1)

        if not opt.fast_feed_forward:
            feedforward = FeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                      variational=self.variational)
            self.feedforward = Bottle(feedforward)
        else:
            self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                       variational=self.variational)
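A note on the PrePostProcessing pattern that recurs in every example here: the sequence string appears to encode a pipeline of steps. Below is a minimal sketch of that contract, assuming 'n' means layer normalization, 'd' means dropout, and 'a' means a residual add; the real class in this codebase may support more step types.

import torch.nn as nn

class TinyPrePostProcessing(nn.Module):
    # Hypothetical minimal reading of the sequence codes used above:
    # 'n' = layer norm, 'd' = dropout, 'a' = residual add.
    def __init__(self, d_model, dropout_p, sequence='n'):
        super().__init__()
        self.sequence = sequence
        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x, residual=None):
        for step in self.sequence:
            if step == 'n':
                x = self.layer_norm(x)
            elif step == 'd':
                x = self.dropout(x)
            elif step == 'a':
                x = x + residual
        return x

Under that reading, each sublayer call is preprocess ('n') -> sublayer -> postprocess ('da', with the sublayer input as residual), i.e. a standard pre-norm residual block.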
Example #2
    def __init__(self,
                 h,
                 d_model,
                 p,
                 d_ff,
                 attn_p=0.1,
                 variational=False,
                 death_rate=0.0,
                 max_len=64,
                 **kwargs):
        super(DistanceTransformerEncoderLayer, self).__init__()
        self.variational = variational
        self.death_rate = death_rate

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  variational=self.variational)
        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 variational=self.variational)
        # self.multihead = MultiHeadAttention(h, d_model, attn_p=attn_p, share=2)
        d_head = d_model // h
        self.multihead = LearnableRelMultiHeadAttn(h,
                                                   d_model,
                                                   d_head,
                                                   dropatt=attn_p,
                                                   max_len=max_len)

        if onmt.constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      variational=self.variational)
        elif onmt.constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        elif onmt.constants.activation_layer == 'linear_swish_linear':
            ff_p = p
            feedforward = FeedForwardSwish(d_model,
                                           d_ff,
                                           ff_p,
                                           variational=self.variational)
        else:
            raise NotImplementedError

        self.feedforward = Bottle(feedforward)
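The maxout branch above sizes its piece count from the feed-forward expansion ratio. A quick worked example of that arithmetic, with illustrative values not taken from the source:

import math

d_model, d_ff = 512, 2048            # illustrative hyperparameters
k = int(math.ceil(d_ff / d_model))   # number of maxout pieces
print(k)                             # 4 -> MaxOut(512, 512, 4)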
Example #3
    def __init__(self, opt, death_rate=0.0):
        super(DecoderLayer, self).__init__()
        self.ignore_source = opt.ignore_source
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.fast_self_attention = opt.fast_self_attention
        self.macaron = opt.macaron
        self.ffn_scale = 0.5 if self.macaron else 1

        if self.macaron:
            self.preprocess_mcr_ffn = preprocessing(opt.rezero, opt.model_size, opt.dropout, sequence='n')
            self.postprocess_mcr_ffn = PrePostProcessing(opt.model_size, opt.dropout,
                                                         sequence='da', variational=self.variational)

            self.mcr_feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                           variational=self.variational,
                                                           activation=opt.ffn_activation, glu=opt.ffn_glu)

        self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                  variational=self.variational)

        if opt.fast_self_attention:
            self.multihead_tgt = SelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
        else:
            self.multihead_tgt = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=1)

        if not self.ignore_source:
            self.preprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
            self.postprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                          variational=self.variational)

            if not opt.fast_xattention:
                self.multihead_src = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=2)
            else:
                self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)

        self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                 variational=self.variational)

        if not opt.fast_feed_forward:
            feedforward = FeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                      variational=self.variational)
            self.feedforward = Bottle(feedforward)
        else:
            self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                       variational=self.variational,
                                                       activation=opt.ffn_activation, glu=opt.ffn_glu)
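The macaron flags above follow the macaron-net/Conformer recipe: a feed-forward block before self-attention and another after it, each weighted by ffn_scale = 0.5. The forward pass is not shown in this example, so the following is only a hedged sketch of how such a half-step FFN is usually applied:

def macaron_ffn_step(x, ffn, norm, dropout, scale=0.5):
    # pre-norm, feed-forward, dropout, then a residual add scaled by 0.5
    return x + scale * dropout(ffn(norm(x)))

# e.g. x = macaron_ffn_step(x, mcr_feedforward, layer_norm, nn.Dropout(0.1))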
Example #4
    def __init__(self, opt, death_rate=0.0, **kwargs):
        super(RelativeTransformerEncoderLayer, self).__init__()
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.fast_self_attention = opt.fast_self_attention

        self.preprocess_attn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='n')
        self.postprocess_attn = PrePostProcessing(opt.model_size,
                                                  opt.dropout,
                                                  sequence='da',
                                                  variational=self.variational)
        self.preprocess_ffn = PrePostProcessing(opt.model_size,
                                                opt.dropout,
                                                sequence='n')
        self.postprocess_ffn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='da',
                                                 variational=self.variational)
        d_head = opt.model_size // opt.n_heads
        if not self.fast_self_attention:
            self.multihead = RelPartialLearnableMultiHeadAttn(
                opt.n_heads, opt.model_size, d_head, dropatt=opt.attn_dropout)
        else:
            self.multihead = RelativeSelfMultiheadAttn(opt.model_size,
                                                       opt.n_heads,
                                                       opt.attn_dropout)

        if not opt.fast_feed_forward:
            feedforward = FeedForward(opt.model_size,
                                      opt.inner_size,
                                      opt.dropout,
                                      variational=self.variational)
            self.feedforward = Bottle(feedforward)
        else:
            self.feedforward = PositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                opt.dropout,
                variational=self.variational)
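Every constructor in this file takes a death_rate, which signals stochastic-depth (LayerDrop-style) training. The forward methods are not shown here, so the following is only an assumed sketch of how a death_rate is typically consumed:

import torch

def stochastic_depth_step(x, sublayer, death_rate, training):
    # With probability death_rate the whole residual branch is skipped
    # during training; survivors are rescaled so expected activations
    # match inference, which always runs the layer.
    if training and torch.rand(1).item() < death_rate:
        return x                          # layer "dies": identity shortcut
    out = sublayer(x)
    if training and death_rate > 0:
        out = out / (1.0 - death_rate)    # inverse-survival rescaling
    return x + out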
Example #5
    def __init__(self, opt, death_rate=0.0):
        super(ReformerEncoderLayer, self).__init__()

        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        d_model = opt.model_size
        p = opt.dropout

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  variational=self.variational)
        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 variational=self.variational)

        self.self_attention = LSHSelfAttention(opt)
        self.feedforward = FeedForward(opt.model_size, opt.inner_size,
                                       opt.dropout, variational=self.variational)
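All of the opt-driven constructors above expect an options object exposing attributes such as model_size, n_heads, and dropout. A minimal sketch of building such an object from a plain namespace; the attribute names are taken from the examples, the values are illustrative:

from types import SimpleNamespace

opt = SimpleNamespace(
    model_size=512, inner_size=2048, n_heads=8,
    dropout=0.1, attn_dropout=0.1,
    variational_dropout=False,
    fast_self_attention=False, fast_feed_forward=False,
)
# layer = EncoderLayer(opt, death_rate=0.1)   # e.g. the class from Example #1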
Example #6
    def __init__(self, opt, death_rate=0.0):
        super(RelativeTransformerDecoderLayer, self).__init__()
        self.ignore_source = opt.ignore_source
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.fast_self_attention = opt.fast_self_attention
        # self.lfv_multilingual = opt.lfv_multilingual

        self.preprocess_attn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='n')
        self.postprocess_attn = PrePostProcessing(opt.model_size,
                                                  opt.dropout,
                                                  sequence='da',
                                                  variational=self.variational)

        if not self.ignore_source:
            self.preprocess_src_attn = PrePostProcessing(opt.model_size,
                                                         opt.dropout,
                                                         sequence='n')
            self.postprocess_src_attn = PrePostProcessing(
                opt.model_size,
                opt.dropout,
                sequence='da',
                variational=self.variational)

            if opt.fast_xattention:
                self.multihead_src = EncdecMultiheadAttn(
                    opt.n_heads, opt.model_size, opt.attn_dropout)
            else:
                self.multihead_src = MultiHeadAttention(
                    opt.n_heads,
                    opt.model_size,
                    attn_p=opt.attn_dropout,
                    share=2)

        self.preprocess_ffn = PrePostProcessing(opt.model_size,
                                                opt.dropout,
                                                sequence='n')
        self.postprocess_ffn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='da',
                                                 variational=self.variational)

        d_head = opt.model_size // opt.n_heads

        if not self.fast_self_attention:
            self.multihead_tgt = RelPartialLearnableMultiHeadAttn(
                opt.n_heads, opt.model_size, d_head, dropatt=opt.attn_dropout)
        else:
            self.multihead_tgt = RelativeSelfMultiheadAttn(
                opt.model_size, opt.n_heads, opt.attn_dropout)

        if not opt.fast_feed_forward:
            feedforward = FeedForward(opt.model_size,
                                      opt.inner_size,
                                      opt.dropout,
                                      variational=self.variational)
            self.feedforward = Bottle(feedforward)
        else:
            self.feedforward = PositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                opt.dropout,
                variational=self.variational)
Example #7
    def __init__(self,
                 h,
                 d_model,
                 p,
                 d_ff,
                 attn_p=0.1,
                 version=1.0,
                 ignore_source=False,
                 variational=False,
                 death_rate=0.0):
        super(RelativeTransformerDecoderLayer, self).__init__()
        self.version = version
        self.ignore_source = ignore_source
        self.variational = variational
        self.death_rate = death_rate

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  variational=self.variational)

        if not self.ignore_source:
            self.preprocess_src_attn = PrePostProcessing(d_model,
                                                         p,
                                                         sequence='n')
            self.postprocess_src_attn = PrePostProcessing(
                d_model, p, sequence='da', variational=self.variational)
            self.multihead_src = MultiHeadAttention(h,
                                                    d_model,
                                                    attn_p=attn_p,
                                                    share=2)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 variational=self.variational)

        d_head = d_model // h
        self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h,
                                                              d_model,
                                                              d_head,
                                                              dropatt=attn_p)
        # self.multihead_tgt = MultiHeadAttention(h, d_model, attn_p=attn_p, share=1)

        if onmt.constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      variational=self.variational)
        elif onmt.constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        elif onmt.constants.activation_layer == 'linear_swish_linear':
            ff_p = p
            feedforward = FeedForwardSwish(d_model, d_ff, ff_p)
        else:
            raise NotImplementedError
        self.feedforward = Bottle(feedforward)
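Unlike the opt-driven variants, this constructor takes explicit hyperparameters (and reads onmt.constants.activation_layer to pick the FFN type). A usage sketch with illustrative values, passing ignore_source=True so no encoder-decoder attention block is built:

layer = RelativeTransformerDecoderLayer(
    h=8,                 # attention heads
    d_model=512,         # model width
    p=0.1,               # residual dropout
    d_ff=2048,           # feed-forward inner size
    ignore_source=True,  # decoder-only: skip the source attention block
)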