Example #1
    def __init__(self, h, d_model, attn_p=0.1):
        super(UniformMultiHeadAttention, self).__init__()
        self.h = h
        self.d = d_model

        assert d_model % h == 0

        self.d_head = d_model // h

        # first attention layer for states
        self.fc_query = Bottle(Linear(d_model, h * self.d_head, bias=False))
        self.fc_key = Bottle(Linear(d_model, h * self.d_head, bias=False))
        self.fc_value = Bottle(Linear(d_model, h * self.d_head, bias=False))

        # second attention for layers
        #~ self.fc_query_2 = Bottle(Linear(d_model, h*self.d_head, bias=False))
        #~ self.fc_key_2 = Bottle(Linear(d_model, h*self.d_head, bias=False))
        #~ self.fc_value_2 = Bottle(Linear(d_model, h*self.d_head, bias=False))

        # for output
        self.sm = nn.Softmax(dim=-1)
        self.fc_concat = Bottle(Linear(h * self.d_head, d_model, bias=False))
        #~ self.fc_concat_2 = Bottle(Linear(d_model, d_model, bias=False))

        #~ self.attn_dropout = nn.Dropout(attn_p)

        self.attn_dropout = StaticDropout(attn_p)
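
All of the attention constructors in this listing share the same head-splitting arithmetic: d_model is divided evenly across h heads (hence the assert d_model % h == 0), and the bias-free query/key/value projections map d_model to h * d_head. Below is a minimal, self-contained sketch of that layout in plain PyTorch; the class name is made up for illustration, and plain nn.Linear / nn.Dropout stand in for the project's Bottle(Linear(...)) and StaticDropout wrappers.

import torch
import torch.nn as nn


class MinimalMultiHeadProjection(nn.Module):
    """Illustrative only: mirrors the d_model -> h * d_head projection layout above."""

    def __init__(self, h, d_model, attn_p=0.1):
        super().__init__()
        assert d_model % h == 0, "d_model must be divisible by the number of heads"
        self.h = h
        self.d_head = d_model // h

        # bias-free projections, as in the examples above
        self.fc_query = nn.Linear(d_model, h * self.d_head, bias=False)
        self.fc_key = nn.Linear(d_model, h * self.d_head, bias=False)
        self.fc_value = nn.Linear(d_model, h * self.d_head, bias=False)
        self.fc_concat = nn.Linear(h * self.d_head, d_model, bias=False)

        self.attn_dropout = nn.Dropout(attn_p)

    def forward(self, query, key, value):
        # query: (batch, t_q, d_model), key/value: (batch, t_k, d_model)
        b, t_q, _ = query.shape
        t_k = key.shape[1]

        # project and split into heads: (batch, heads, time, d_head)
        q = self.fc_query(query).view(b, t_q, self.h, self.d_head).transpose(1, 2)
        k = self.fc_key(key).view(b, t_k, self.h, self.d_head).transpose(1, 2)
        v = self.fc_value(value).view(b, t_k, self.h, self.d_head).transpose(1, 2)

        # scaled dot-product attention per head
        scores = torch.matmul(q, k.transpose(-2, -1)) / self.d_head ** 0.5
        attn = self.attn_dropout(torch.softmax(scores, dim=-1))

        # merge heads back to (batch, t_q, d_model) and project out
        out = torch.matmul(attn, v).transpose(1, 2).reshape(b, t_q, self.h * self.d_head)
        return self.fc_concat(out)


if __name__ == "__main__":
    layer = MinimalMultiHeadProjection(h=8, d_model=512)
    out = layer(torch.randn(2, 10, 512), torch.randn(2, 12, 512), torch.randn(2, 12, 512))
    print(out.shape)  # torch.Size([2, 10, 512])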
Example #2
    def __init__(self, opt, death_rate=0.0, **kwargs):
        super(EncoderLayer, self).__init__()
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.fast_self_attention = opt.fast_self_attention

        self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                  variational=self.variational)
        self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                 variational=self.variational)

        if opt.fast_self_attention:
            self.multihead = SelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
        else:
            self.multihead = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=1)

        if not opt.fast_feed_forward:
            feedforward = FeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                      variational=self.variational)
            self.feedforward = Bottle(feedforward)
        else:
            self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                       variational=self.variational)
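
The sequence='n' / sequence='da' pairs above implement the pre-norm residual pattern: layer norm before each sub-layer, then dropout plus a residual add after it. The sketch below shows that wiring with stock PyTorch modules; SimplePreNormEncoderLayer and the use of nn.MultiheadAttention are illustrative stand-ins, not the project's SelfMultiheadAttn / MultiHeadAttention classes, and stochastic depth (death_rate) is omitted.

import torch
import torch.nn as nn


class SimplePreNormEncoderLayer(nn.Module):
    """Illustrative pre-norm encoder layer: norm -> sub-layer -> dropout -> residual add."""

    def __init__(self, model_size, inner_size, n_heads, dropout=0.1, attn_dropout=0.1):
        super().__init__()
        self.attn_norm = nn.LayerNorm(model_size)
        self.attn = nn.MultiheadAttention(model_size, n_heads, dropout=attn_dropout)
        self.ffn_norm = nn.LayerNorm(model_size)
        self.ffn = nn.Sequential(
            nn.Linear(model_size, inner_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(inner_size, model_size),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: (seq_len, batch, model_size)
        # 'n' preprocess, self-attention, 'da' postprocess
        h = self.attn_norm(x)
        h, _ = self.attn(h, h, h)
        x = x + self.dropout(h)

        # 'n' preprocess, feed-forward, 'da' postprocess
        h = self.ffn(self.ffn_norm(x))
        x = x + self.dropout(h)
        return x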
Example #3
    def __init__(self, h, d_model, p, d_ff, position_encoder, time_encoder, attn_p=0.1, version=1.0):
        super(UniversalDecoderLayer, self).__init__()
        self.version = version
        self.position_encoder = position_encoder
        self.time_encoder = time_encoder

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.constants.static)

        self.preprocess_src_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.constants.static)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=onmt.constants.static)

        self.multihead_tgt = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.constants.static)
        self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.constants.static)

        if onmt.constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model, d_ff, ff_p, static=onmt.constants.static)
        elif onmt.constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        else:
            # guard against unsupported settings; otherwise `feedforward` would be undefined below
            raise NotImplementedError
        self.feedforward = Bottle(feedforward)
Example #4
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1, version=1.0, ignore_source=False,
                 variational=False, death_rate=0.0):
        super(TransformerXLDecoderLayer, self).__init__()
        self.version = version
        self.ignore_source = ignore_source
        self.variational = variational
        self.death_rate = death_rate

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)

        d_head = d_model // h
        self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h, d_model, d_head, dropatt=attn_p)

        if onmt.constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model, d_ff, ff_p, variational=self.variational)
        elif onmt.constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        elif onmt.constants.activation_layer == 'linear_swish_linear':
            ff_p = p
            feedforward = FeedForwardSwish(d_model, d_ff, ff_p)
        else:
            raise NotImplementedError
        self.feedforward = Bottle(feedforward)
Example #5
    def __init__(self, d_model, dropout_p, sequence='nda', variational=False, elementwise_affine=True,
                 multilingual=False, n_languages=1):
        super(PrePostProcessing, self).__init__()
        self.d_model = d_model
        self.dropout_p = dropout_p
        self.multilingual = multilingual

        self.steps = list(sequence)

        if onmt.constants.residual_type == 'gated':
            # gated residual
            # initialize k with one
            self.k = nn.Parameter(torch.ones(1))

        if 'n' in self.steps:
            if not multilingual:
                ln = LayerNorm((self.d_model,), elementwise_affine=elementwise_affine)
                self.layer_norm = Bottle(ln)
            else:
                ln = MultilingualLayerNorm((self.d_model,), eps=1e-5, elementwise_affine=True, n_languages=n_languages)
                self.layer_norm = ln
        if 'd' in self.steps:
            if variational:
                self.dropout = VariationalDropout(self.dropout_p, batch_first=False)
            else:
                self.dropout = nn.Dropout(self.dropout_p)
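
For reference, the sequence string selects the processing steps in order: 'n' applies layer norm, 'd' applies (variational) dropout, and 'a' adds the residual (scaled by the gate k when residual_type == 'gated'). A minimal re-implementation of that dispatch is sketched below; the forward signature forward(x, residual=None) and the omission of the multilingual and gated-residual branches are simplifying assumptions, not the project's exact interface.

import torch
import torch.nn as nn


class MinimalPrePostProcessing(nn.Module):
    """Illustrative sketch of the 'n' / 'd' / 'a' step dispatch."""

    def __init__(self, d_model, dropout_p, sequence='nda'):
        super().__init__()
        self.steps = list(sequence)
        if 'n' in self.steps:
            self.layer_norm = nn.LayerNorm(d_model)
        if 'd' in self.steps:
            self.dropout = nn.Dropout(dropout_p)

    def forward(self, x, residual=None):
        for step in self.steps:
            if step == 'n':
                x = self.layer_norm(x)
            elif step == 'd':
                x = self.dropout(x)
            elif step == 'a' and residual is not None:
                x = x + residual  # residual add
        return x

Under this reading, sequence='n' corresponds to pure pre-normalization and sequence='da' to the dropout-plus-residual step used after every sub-layer in the examples above.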
Example #6
    def __init__(self, h, d_model, attn_p=0.1, static=False, share=3):
        super(MultiHeadAttention, self).__init__()
        self.h = h
        self.d = d_model
        self.share = share

        assert d_model % h == 0

        self.d_head = d_model // h
        self.fc_query = Bottle(Linear(d_model, h * self.d_head, bias=False))
        self.fc_key = Bottle(Linear(d_model, h * self.d_head, bias=False))
        self.fc_value = Bottle(Linear(d_model, h * self.d_head, bias=False))
        self.fc_concat = Bottle(Linear(h * self.d_head, d_model, bias=False))

        self.sm = nn.Softmax(dim=-1)

        if static:
            self.attn_dropout = StaticDropout(attn_p)
        else:
            self.attn_dropout = nn.Dropout(attn_p)
Example #7
    def __init__(self,
                 h,
                 d_model,
                 p,
                 d_ff,
                 attn_p=0.1,
                 variational=False,
                 death_rate=0.0,
                 max_len=64,
                 **kwargs):
        super(DistanceTransformerEncoderLayer, self).__init__()
        self.variational = variational
        self.death_rate = death_rate

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  variational=self.variational)
        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 variational=self.variational)
        # self.multihead = MultiHeadAttention(h, d_model, attn_p=attn_p, share=2)
        d_head = d_model // h
        self.multihead = LearnableRelMultiHeadAttn(h,
                                                   d_model,
                                                   d_head,
                                                   dropatt=attn_p,
                                                   max_len=max_len)

        if onmt.constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      variational=self.variational)
        elif onmt.constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        elif onmt.constants.activation_layer == 'linear_swish_linear':
            ff_p = p
            feedforward = FeedForwardSwish(d_model,
                                           d_ff,
                                           ff_p,
                                           variational=self.variational)
        else:
            raise NotImplementedError

        self.feedforward = Bottle(feedforward)
Example #8
    def __init__(self, opt, death_rate=0.0):
        super(DecoderLayer, self).__init__()
        self.ignore_source = opt.ignore_source
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.fast_self_attention = opt.fast_self_attention
        self.macaron = opt.macaron
        self.ffn_scale = 0.5 if self.macaron else 1

        if self.macaron:
            self.preprocess_mcr_ffn = preprocessing(opt.rezero, opt.model_size, opt.dropout, sequence='n')
            self.postprocess_mcr_ffn = PrePostProcessing(opt.model_size, opt.dropout,
                                                         sequence='da', variational=self.variational)

            self.mcr_feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                           variational=self.variational,
                                                           activation=opt.ffn_activation, glu=opt.ffn_glu)

        self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                  variational=self.variational)

        if opt.fast_self_attention:
            self.multihead_tgt = SelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
        else:
            self.multihead_tgt = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=1)

        if not self.ignore_source:
            self.preprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
            self.postprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                          variational=self.variational)

            if not opt.fast_xattention:
                self.multihead_src = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=2)
            else:
                self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)

        self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                 variational=self.variational)

        if not opt.fast_feed_forward:
            feedforward = FeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                      variational=self.variational)
            self.feedforward = Bottle(feedforward)
        else:
            self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                       variational=self.variational,
                                                       activation=opt.ffn_activation, glu=opt.ffn_glu)
Example #9
    def __init__(self, opt, death_rate=0.0, **kwargs):
        super(RelativeTransformerEncoderLayer, self).__init__()
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.fast_self_attention = opt.fast_self_attention

        self.preprocess_attn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='n')
        self.postprocess_attn = PrePostProcessing(opt.model_size,
                                                  opt.dropout,
                                                  sequence='da',
                                                  variational=self.variational)
        self.preprocess_ffn = PrePostProcessing(opt.model_size,
                                                opt.dropout,
                                                sequence='n')
        self.postprocess_ffn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='da',
                                                 variational=self.variational)
        d_head = opt.model_size // opt.n_heads
        if not self.fast_self_attention:
            self.multihead = RelPartialLearnableMultiHeadAttn(
                opt.n_heads, opt.model_size, d_head, dropatt=opt.attn_dropout)
        else:
            self.multihead = RelativeSelfMultiheadAttn(opt.model_size,
                                                       opt.n_heads,
                                                       opt.attn_dropout)

        if not opt.fast_feed_forward:
            feedforward = FeedForward(opt.model_size,
                                      opt.inner_size,
                                      opt.dropout,
                                      variational=self.variational)
            self.feedforward = Bottle(feedforward)
        else:
            self.feedforward = PositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                opt.dropout,
                variational=self.variational)
Example #10
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
        super(FCTDecoderLayer, self).__init__()

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=True)

        self.preprocess_src_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(d_model,
                                                      p,
                                                      sequence='da',
                                                      static=True)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=True)

        #~ self.multihead_tgt = HierarchicalMultiHeadAttention(h, d_model, attn_p=attn_p)
        self.multihead_tgt = UniformMultiHeadAttention(h,
                                                       d_model,
                                                       attn_p=attn_p)
        #~ self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p)
        self.multihead_src = UniformMultiHeadAttention(h,
                                                       d_model,
                                                       attn_p=attn_p)

        if onmt.constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model, d_ff, ff_p)
        elif onmt.constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        else:
            # guard against unsupported settings; otherwise `feedforward` would be undefined below
            raise NotImplementedError
        self.feedforward = Bottle(feedforward)
Example #11
    def __init__(self, opt, death_rate=0.0):
        super(RelativeTransformerDecoderLayer, self).__init__()
        self.ignore_source = opt.ignore_source
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.fast_self_attention = opt.fast_self_attention
        # self.lfv_multilingual = opt.lfv_multilingual

        self.preprocess_attn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='n')
        self.postprocess_attn = PrePostProcessing(opt.model_size,
                                                  opt.dropout,
                                                  sequence='da',
                                                  variational=self.variational)

        if not self.ignore_source:
            self.preprocess_src_attn = PrePostProcessing(opt.model_size,
                                                         opt.dropout,
                                                         sequence='n')
            self.postprocess_src_attn = PrePostProcessing(
                opt.model_size,
                opt.dropout,
                sequence='da',
                variational=self.variational)

            if opt.fast_xattention:
                self.multihead_src = EncdecMultiheadAttn(
                    opt.n_heads, opt.model_size, opt.attn_dropout)
            else:
                self.multihead_src = MultiHeadAttention(
                    opt.n_heads,
                    opt.model_size,
                    attn_p=opt.attn_dropout,
                    share=2)

        self.preprocess_ffn = PrePostProcessing(opt.model_size,
                                                opt.dropout,
                                                sequence='n')
        self.postprocess_ffn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='da',
                                                 variational=self.variational)

        d_head = opt.model_size // opt.n_heads

        if not self.fast_self_attention:
            self.multihead_tgt = RelPartialLearnableMultiHeadAttn(
                opt.n_heads, opt.model_size, d_head, dropatt=opt.attn_dropout)
        else:
            self.multihead_tgt = RelativeSelfMultiheadAttn(
                opt.model_size, opt.n_heads, opt.attn_dropout)

        if not opt.fast_feed_forward:
            feedforward = FeedForward(opt.model_size,
                                      opt.inner_size,
                                      opt.dropout,
                                      variational=self.variational)
            self.feedforward = Bottle(feedforward)
        else:
            self.feedforward = PositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                opt.dropout,
                variational=self.variational)
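
The decoder layers above add one more sub-layer than the encoder: masked self-attention, an optional encoder-decoder attention block (skipped when ignore_source is set), and the feed-forward block, each wrapped in the same norm/dropout/residual pattern. A compact sketch with stock PyTorch modules follows; the class name and the use of nn.MultiheadAttention are assumptions for illustration, and relative-position attention, macaron feed-forward blocks, and stochastic depth are left out.

import torch
import torch.nn as nn


class SimplePreNormDecoderLayer(nn.Module):
    """Illustrative decoder layer: self-attention, optional source attention, feed-forward."""

    def __init__(self, model_size, inner_size, n_heads, dropout=0.1,
                 attn_dropout=0.1, ignore_source=False):
        super().__init__()
        self.ignore_source = ignore_source

        self.self_attn_norm = nn.LayerNorm(model_size)
        self.self_attn = nn.MultiheadAttention(model_size, n_heads, dropout=attn_dropout)

        if not ignore_source:
            self.src_attn_norm = nn.LayerNorm(model_size)
            self.src_attn = nn.MultiheadAttention(model_size, n_heads, dropout=attn_dropout)

        self.ffn_norm = nn.LayerNorm(model_size)
        self.ffn = nn.Sequential(
            nn.Linear(model_size, inner_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(inner_size, model_size),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, context=None, tgt_mask=None):
        # x: (tgt_len, batch, model_size), context: (src_len, batch, model_size)
        # masked self-attention with pre-norm and residual
        h = self.self_attn_norm(x)
        h, _ = self.self_attn(h, h, h, attn_mask=tgt_mask)
        x = x + self.dropout(h)

        # encoder-decoder attention, skipped when ignore_source is set
        if not self.ignore_source and context is not None:
            h = self.src_attn_norm(x)
            h, _ = self.src_attn(h, context, context)
            x = x + self.dropout(h)

        # position-wise feed-forward
        h = self.ffn(self.ffn_norm(x))
        return x + self.dropout(h)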