Example #1
    def __init__(self, opt, embedding, language_embeddings=None, **kwargs):
        super(SpeechLSTMDecoder, self).__init__()

        # Keep for reference

        # Define layers
        self.model_size = opt.model_size
        self.layers = opt.layers
        self.dropout = opt.dropout

        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.variational_dropout = opt.variational_dropout

        self.encoder_type = opt.encoder_type

        self.lstm = nn.LSTM(self.model_size,
                            self.model_size,
                            self.layers,
                            dropout=self.dropout,
                            batch_first=True)

        self.fast_xattention = opt.fast_xattention
        self.n_head = 1  # fixed
        # also fix attention dropout to 0.0

        if opt.fast_xattention:
            self.multihead_tgt = EncdecMultiheadAttn(self.n_head,
                                                     opt.model_size, 0.0)
        else:
            self.multihead_tgt = MultiHeadAttention(self.n_head,
                                                    opt.model_size,
                                                    attn_p=0.0,
                                                    share=3)

        self.preprocess_layer = PrePostProcessing(
            self.model_size,
            self.emb_dropout,
            sequence='d',
            variational=self.variational_dropout)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')
        self.preprocess_attn = PrePostProcessing(self.model_size,
                                                 0,
                                                 sequence='n')

        self.word_lut = embedding

        self.encoder_cnn_downsampling = opt.cnn_downsampling
        self.language_embeddings = language_embeddings
        self.use_language_embedding = opt.use_language_embedding
        self.language_embedding_type = opt.language_embedding_type

        if self.language_embedding_type == 'concat':
            self.projector = nn.Linear(opt.model_size * 2, opt.model_size)

        print("* Create LSTM Decoder with %d layers." % self.layers)
Example #2
    def __init__(self, opt, embedding, language_embeddings=None, **kwargs):
        super(SpeechLSTMDecoder, self).__init__()

        # Keep for reference

        # Define layers
        self.model_size = opt.model_size
        self.layers = opt.layers
        self.dropout = opt.dropout

        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.variational_dropout = opt.variational_dropout
        self.multilingual_factorized_weights = opt.multilingual_factorized_weights
        self.mfw_rank = opt.mfw_rank
        self.encoder_type = opt.encoder_type
        self.n_languages = opt.n_languages

        self.lstm = nn.LSTM(self.model_size, self.model_size, self.layers, dropout=self.dropout, batch_first=True)
        if self.multilingual_factorized_weights:
            from onmt.modules.weight_control_lstm import WeightFactoredLSTM
            self.lstm = WeightFactoredLSTM(self.lstm, dropout=opt.weight_drop, n_languages=opt.n_languages,
                                           rank=self.mfw_rank)

        self.fast_xattention = opt.fast_xattention
        self.n_head = 1  # fixed to always use 1 head
        # also fix attention dropout to 0.0

        if self.multilingual_factorized_weights:
            self.fast_xattention = True
            from onmt.modules.multilingual_factorized.encdec_attention import MFWEncdecMultiheadAttn
            self.multihead_tgt = MFWEncdecMultiheadAttn(self.n_head, opt.model_size, 0.0, n_languages=opt.n_languages,
                                                        rank=opt.mfw_rank, weight_drop=0.0)
        else:
            if opt.fast_xattention:
                self.multihead_tgt = EncdecMultiheadAttn(self.n_head, opt.model_size, 0.0)
            else:
                self.multihead_tgt = MultiHeadAttention(self.n_head, opt.model_size, attn_p=0.0, share=3)

        self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d',
                                                  variational=self.variational_dropout)

        self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
        self.preprocess_attn = PrePostProcessing(self.model_size, 0, sequence='n')

        self.word_lut = embedding

        self.encoder_cnn_downsampling = opt.cnn_downsampling
        self.language_embeddings = language_embeddings
        self.use_language_embedding = opt.use_language_embedding
        self.language_embedding_type = opt.language_embedding_type

        if self.language_embedding_type == 'concat':
            self.projector = nn.Linear(opt.model_size * 2, opt.model_size)

        print("* Create LSTM Decoder with %d layers." % self.layers)
Example #3
    def __init__(self, opt):
        super().__init__()
        self.layer_norm = nn.LayerNorm((opt.model_size,),
                                       elementwise_affine=True)
        self.residual_dropout = opt.residual_dropout if opt.residual_dropout >= 0 else opt.dropout
        self.attn = EncdecMultiheadAttn(opt.n_heads,
                                        opt.model_size,
                                        attn_drop=opt.attn_dropout)
        self.dropout = opt.attn_dropout
        self.variational = opt.variational_dropout
Example #4
    def __init__(self, opt):
        super(TacotronDecoder, self).__init__()
        self.n_mel_channels = opt.n_mel_channels
        self.n_frames_per_step = opt.n_frames_per_step
        self.encoder_embedding_dim = opt.model_size
        self.attention_rnn_dim = opt.model_size
        self.decoder_rnn_dim = opt.model_size
        self.prenet_dim = opt.prenet_dim
        self.max_decoder_steps = opt.max_decoder_steps
        self.gate_threshold = 0.5
        self.p_attention_dropout = opt.attn_dropout
        self.p_decoder_dropout = opt.dropout
        self.encoder_type = opt.encoder_type

        self.lstm = nn.LSTM(opt.prenet_dim, opt.model_size, 2, dropout=opt.dropout, batch_first=True)

        self.linear_trans = nn.Linear(opt.n_mel_channels * opt.n_frames_per_step, opt.model_size)
        torch.nn.init.xavier_uniform_(self.linear_trans.weight)

        if opt.fast_xattention:
            self.multihead_tgt = EncdecMultiheadAttn(1, opt.model_size, opt.attn_dropout)
        else:
            self.multihead_tgt = MultiHeadAttention(1, opt.model_size, attn_p=opt.attn_dropout, share=3)

        self.preprocess_layer = PrePostProcessing(opt.model_size, 0, sequence='n')

        self.prenet = Prenet(
            opt.n_mel_channels * opt.n_frames_per_step,
            [opt.prenet_dim, opt.prenet_dim])

        self.attention_rnn = nn.LSTMCell(
            opt.prenet_dim + opt.model_size,
            opt.model_size)

        self.attention_layer = Attention(
            opt.model_size, opt.model_size,
            opt.attention_dim, opt.attention_location_n_filters,
            opt.attention_location_kernel_size)

        self.postprocess_layer = PrePostProcessing(opt.model_size, 0, sequence='n')

        # nn.LSTMCell(input_size, hidden_size, bias): the trailing 1 is the bias flag
        # (truthy, so bias=True), not a layer count.
        self.decoder_rnn = nn.LSTMCell(
            opt.model_size + opt.model_size,
            opt.model_size, 1)

        self.linear_projection = LinearNorm(
            opt.model_size,
            opt.n_mel_channels * opt.n_frames_per_step)

        self.gate_layer = LinearNorm(
            opt.model_size, 1,
            bias=True, w_init_gain='sigmoid')
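
The modules above follow the usual Tacotron-2-style decoder step. The forward pass is not part of this snippet, so the wiring below is a hedged sketch based on the standard architecture and on the input/output sizes declared in __init__:

    # Hypothetical single decoding step (dimensions match the constructor above):
    #   prenet_out = self.prenet(prev_frame)                                 # n_mel*n_frames -> prenet_dim
    #   att_h, att_c = self.attention_rnn(cat(prenet_out, context), (att_h, att_c))
    #   context, weights = self.attention_layer(att_h, memory, ...)          # location-sensitive attention
    #   dec_h, dec_c = self.decoder_rnn(cat(att_h, context), (dec_h, dec_c))
    #   frame = self.linear_projection(dec_h)                                # -> n_mel_channels * n_frames_per_step
    #   stop = torch.sigmoid(self.gate_layer(dec_h))                         # compared against gate_threshold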
Example #5
    def __init__(self, opt, death_rate=0.0):
        super(DecoderLayer, self).__init__()
        self.ignore_source = opt.ignore_source
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.fast_self_attention = opt.fast_self_attention
        self.macaron = opt.macaron
        self.ffn_scale = 0.5 if self.macaron else 1

        if self.macaron:
            self.preprocess_mcr_ffn = preprocessing(opt.rezero, opt.model_size, opt.dropout, sequence='n')
            self.postprocess_mcr_ffn = PrePostProcessing(opt.model_size, opt.dropout,
                                                         sequence='da', variational=self.variational)

            self.mcr_feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                           variational=self.variational,
                                                           activation=opt.ffn_activation, glu=opt.ffn_glu)

        self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                  variational=self.variational)

        if opt.fast_self_attention:
            self.multihead_tgt = SelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
        else:
            self.multihead_tgt = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=1)

        if not self.ignore_source:
            self.preprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
            self.postprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                          variational=self.variational)

            if not opt.fast_xattention:
                self.multihead_src = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=2)
            else:
                self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)

        self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                 variational=self.variational)

        if not opt.fast_feed_forward:

            feedforward = FeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                      variational=self.variational)
            self.feedforward = Bottle(feedforward)
        else:
            self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                       variational=self.variational,
                                                       activation=opt.ffn_activation, glu=opt.ffn_glu)
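
The ffn_scale = 0.5 above is the macaron-net convention of wrapping the attention between two half-weighted feed-forward blocks. The layer's forward pass is not shown in this snippet; the residual pattern it implies is sketched below, where 'n' preprocessing is layer normalization and 'da' postprocessing is dropout followed by the residual add:

    # Hedged sketch of the assumed macaron-style sublayer order:
    #   x = postprocess_mcr_ffn(ffn_scale * mcr_feedforward(preprocess_mcr_ffn(x)), x)
    #   x = postprocess_attn(multihead_tgt(preprocess_attn(x), ...), x)
    #   x = postprocess_src_attn(multihead_src(preprocess_src_attn(x), context, ...), x)  # unless ignore_source
    #   x = postprocess_ffn(ffn_scale * feedforward(preprocess_ffn(x)), x)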
Example #6
    def __init__(self, opt, embedding, language_embeddings=None, ignore_source=False, allocate_positions=True):
        super(SpeechLSTMDecoder, self).__init__()

        # Keep for reference

        # Define layers
        self.model_size = opt.model_size
        self.layers = opt.layers
        self.dropout = opt.dropout

        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.variational_dropout = opt.variational_dropout

        self.encoder_type = opt.encoder_type

        self.lstm = nn.LSTM(self.model_size, self.model_size, self.layers, dropout=self.dropout, batch_first=True)

        self.fast_self_attention = opt.fast_self_attention

        if opt.fast_xattention:
            self.multihead_tgt = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)
        else:
            self.multihead_tgt = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=3)

        # self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d',
        #                                           variational=self.variational_dropout)
        self.preprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
        self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

        self.word_lut = embedding

        self.encoder_cnn_downsampling = opt.cnn_downsampling
        self.language_embeddings = language_embeddings
        self.use_language_embedding = opt.use_language_embedding
        self.gumbel_embedding = opt.gumbel_embedding
        self.bottleneck = opt.bottleneck
        self.language_embedding_type = opt.language_embedding_type

        if self.language_embedding_type == 'concat':
            self.projector = nn.Linear(opt.model_size * 2, opt.model_size)
Example #7
    def __init__(self, opt, death_rate=0.0):
        super().__init__()
        self.ignore_source = opt.ignore_source
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.fast_self_attention = opt.fast_self_attention
        self.factor_size = opt.layers
        self.adaptive_type = opt.adaptive

        self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                  variational=self.variational)

        if not self.ignore_source:
            self.preprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
            self.postprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                          variational=self.variational)

            if self.adaptive_type == 'universal':
                self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)
            else:
                self.multihead_src = AdaptiveEncDecAttn(opt.n_heads, opt.model_size, self.factor_size, opt.attn_dropout)

        self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                 variational=self.variational)

        if self.adaptive_type == 'universal':
            self.multihead_tgt = RelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)

            self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                       variational=self.variational)
        else:
            self.multihead_tgt = AdaptiveRelativeAttn(opt.model_size, opt.n_heads, self.factor_size,
                                                      opt.attn_dropout)
            self.feedforward = AdaptiveFeedForward(opt.model_size, opt.inner_size, self.factor_size,
                                                   opt.dropout, variational=self.variational)
Example #8
    def __init__(self, opt, death_rate=0.0):
        super(RelativeTransformerDecoderLayer, self).__init__()
        self.ignore_source = opt.ignore_source
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.fast_self_attention = opt.fast_self_attention
        # self.lfv_multilingual = opt.lfv_multilingual

        self.preprocess_attn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='n')
        self.postprocess_attn = PrePostProcessing(opt.model_size,
                                                  opt.dropout,
                                                  sequence='da',
                                                  variational=self.variational)

        if not self.ignore_source:
            self.preprocess_src_attn = PrePostProcessing(opt.model_size,
                                                         opt.dropout,
                                                         sequence='n')
            self.postprocess_src_attn = PrePostProcessing(
                opt.model_size,
                opt.dropout,
                sequence='da',
                variational=self.variational)

            if opt.fast_xattention:
                self.multihead_src = EncdecMultiheadAttn(
                    opt.n_heads, opt.model_size, opt.attn_dropout)
            else:
                self.multihead_src = MultiHeadAttention(
                    opt.n_heads,
                    opt.model_size,
                    attn_p=opt.attn_dropout,
                    share=2)

        self.preprocess_ffn = PrePostProcessing(opt.model_size,
                                                opt.dropout,
                                                sequence='n')
        self.postprocess_ffn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='da',
                                                 variational=self.variational)

        d_head = opt.model_size // opt.n_heads

        if not self.fast_self_attention:
            self.multihead_tgt = RelPartialLearnableMultiHeadAttn(
                opt.n_heads, opt.model_size, d_head, dropatt=opt.attn_dropout)
        else:
            self.multihead_tgt = RelativeSelfMultiheadAttn(
                opt.model_size, opt.n_heads, opt.attn_dropout)

        if not opt.fast_feed_forward:
            feedforward = FeedForward(opt.model_size,
                                      opt.inner_size,
                                      opt.dropout,
                                      variational=self.variational)
            self.feedforward = Bottle(feedforward)
        else:
            self.feedforward = PositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                opt.dropout,
                variational=self.variational)
Example #9
    def __init__(self, opt, death_rate=0.0, lid_net=None):
        super(RelativeTransformerDecoderLayer, self).__init__()
        self.ignore_source = opt.ignore_source
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.mfw = opt.multilingual_factorized_weights
        self.mpw = opt.multilingual_partitioned_weights
        self.mln = opt.multilingual_layer_norm
        self.weight_drop = opt.weight_drop
        self.multilingual_adapter = opt.multilingual_adapter
        self.adapter_bottleneck_size = opt.adapter_bottleneck_size

        self.preprocess_attn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='n',
                                                 multilingual=self.mln,
                                                 n_languages=opt.n_languages)
        self.postprocess_attn = PrePostProcessing(opt.model_size,
                                                  opt.dropout,
                                                  sequence='da',
                                                  variational=self.variational)

        if not self.ignore_source:
            self.preprocess_src_attn = PrePostProcessing(
                opt.model_size,
                opt.dropout,
                sequence='n',
                multilingual=self.mln,
                n_languages=opt.n_languages)
            self.postprocess_src_attn = PrePostProcessing(
                opt.model_size,
                opt.dropout,
                sequence='da',
                variational=self.variational)

            if self.mfw:
                self.multihead_src = MFWEncdecMultiheadAttn(
                    opt.n_heads,
                    opt.model_size,
                    opt.attn_dropout,
                    n_languages=opt.n_languages,
                    rank=opt.mfw_rank,
                    use_multiplicative=opt.mfw_multiplicative,
                    weight_drop=self.weight_drop,
                    mfw_activation=opt.mfw_activation)
            elif self.mpw:
                self.multihead_src = MPEncdecMultiheadAttn(
                    opt.n_heads,
                    opt.model_size,
                    opt.attn_dropout,
                    factor_size=opt.mpw_factor_size)

            else:
                self.multihead_src = EncdecMultiheadAttn(
                    opt.n_heads, opt.model_size, opt.attn_dropout)

        self.preprocess_ffn = PrePostProcessing(opt.model_size,
                                                opt.dropout,
                                                sequence='n',
                                                multilingual=self.mln,
                                                n_languages=opt.n_languages)
        self.postprocess_ffn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='da',
                                                 variational=self.variational)

        d_head = opt.model_size // opt.n_heads

        if self.mfw:
            self.feedforward = MFWPositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                opt.dropout,
                variational=self.variational,
                n_languages=opt.n_languages,
                rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative,
                weight_drop=self.weight_drop,
                mfw_activation=opt.mfw_activation)

            self.multihead_tgt = MFWRelativeSelfMultiheadAttn(
                opt.model_size,
                opt.n_heads,
                opt.attn_dropout,
                n_languages=opt.n_languages,
                rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative,
                weight_drop=self.weight_drop,
                mfw_activation=opt.mfw_activation)
        elif self.mpw:
            self.feedforward = MPPositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                opt.dropout,
                variational=self.variational,
                factor_size=opt.mpw_factor_size)

            self.multihead_tgt = MPRelativeSelfMultiheadAttn(
                opt.model_size,
                opt.n_heads,
                opt.attn_dropout,
                factor_size=opt.mpw_factor_size)
        else:
            self.multihead_tgt = RelativeSelfMultiheadAttn(
                opt.model_size, opt.n_heads, opt.attn_dropout)

            self.feedforward = PositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                opt.dropout,
                variational=self.variational)

        self.lfv_multilingual = opt.lfv_multilingual

        if opt.lfv_multilingual:
            self.lid_net = lid_net
            self.lfv_mapper = nn.Linear(opt.bottleneck_size, opt.model_size)
        else:
            self.lid_net = None
            self.lfv_mapper = None

        if self.multilingual_adapter:

            from onmt.modules.multilingual_factorized.multilingual_adapters import MultilingualAdapter
            self.adapters = MultilingualAdapter(opt.model_size,
                                                opt.adapter_bottleneck_size,
                                                n_languages=opt.n_languages,
                                                dropout=opt.dropout)
Example #10
    def __init__(self, opt, death_rate=0.0):
        super(RelativeTransformerDecoderLayer, self).__init__()
        self.ignore_source = opt.ignore_source
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.batch_ensemble = opt.batch_ensemble
        self.mfw = opt.multilingual_factorized_weights
        self.macaron = opt.macaron
        self.ffn_scale = 0.5 if self.macaron else 1
        self.dropout = opt.dropout

        if self.macaron:
            self.preprocess_mcr_ffn = PrePostProcessing(opt.model_size,
                                                        opt.dropout,
                                                        sequence='n')
            self.postprocess_mcr_ffn = PrePostProcessing(
                opt.model_size,
                opt.dropout,
                sequence='da',
                variational=self.variational)

            if self.mfw:
                self.mcr_feedforward = MFWPositionWiseFeedForward(
                    opt.model_size,
                    opt.inner_size,
                    opt.dropout,
                    variational=self.variational,
                    n_languages=opt.n_languages,
                    rank=opt.mfw_rank,
                    use_multiplicative=opt.mfw_multiplicative)
            else:
                self.mcr_feedforward = PositionWiseFeedForward(
                    opt.model_size,
                    opt.inner_size,
                    opt.dropout,
                    variational=self.variational)

        self.preprocess_attn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='n')
        self.postprocess_attn = PrePostProcessing(opt.model_size,
                                                  opt.dropout,
                                                  sequence='da',
                                                  variational=self.variational)

        if not self.ignore_source:
            self.preprocess_src_attn = PrePostProcessing(opt.model_size,
                                                         opt.dropout,
                                                         sequence='n')
            self.postprocess_src_attn = PrePostProcessing(
                opt.model_size,
                opt.dropout,
                sequence='da',
                variational=self.variational)
            # if self.batch_ensemble > 0:
            #     self.multihead_src = BEEncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout,
            #                                                ensemble=self.batch_ensemble)
            # else:

            if not self.mfw:
                self.multihead_src = EncdecMultiheadAttn(
                    opt.n_heads, opt.model_size, opt.attn_dropout)
            else:
                self.multihead_src = MFWEncdecMultiheadAttn(
                    opt.n_heads,
                    opt.model_size,
                    opt.attn_dropout,
                    n_languages=opt.n_languages,
                    rank=opt.mfw_rank,
                    use_multiplicative=opt.mfw_multiplicative)

        self.preprocess_ffn = PrePostProcessing(opt.model_size,
                                                opt.dropout,
                                                sequence='n')
        self.postprocess_ffn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='da',
                                                 variational=self.variational)

        d_head = opt.model_size // opt.n_heads

        if self.mfw:
            self.feedforward = MFWPositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                opt.dropout,
                variational=self.variational,
                n_languages=opt.n_languages,
                rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative)

            self.multihead_tgt = MFWRelativeSelfMultiheadAttn(
                opt.model_size,
                opt.n_heads,
                opt.attn_dropout,
                n_languages=opt.n_languages,
                rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative)
        else:

            self.feedforward = PositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                opt.dropout,
                variational=self.variational)

            self.multihead_tgt = RelativeSelfMultiheadAttn(
                opt.model_size, opt.n_heads, opt.attn_dropout)
Example #11
    def __init__(self, opt, death_rate=0.0):
        super(RelativeTransformerDecoderLayer, self).__init__()
        self.ignore_source = opt.ignore_source
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.batch_ensemble = opt.batch_ensemble
        self.mfw = opt.multilingual_factorized_weights
        self.macaron = opt.macaron
        self.ffn_scale = 0.5 if self.macaron else 1
        self.dropout = opt.dropout
        self.residual_dropout = opt.residual_dropout if opt.residual_dropout >= 0 else opt.dropout
        self.ffn_dropout = opt.ffn_dropout if opt.ffn_dropout >= 0 else opt.dropout
        self.rezero = opt.rezero
        self.n_heads = opt.n_heads
        self.absolute_position_encoding = opt.absolute_position_encoding
        self.learnable_pos = opt.learnable_position_encoding
        self.stochastic_sublayer = opt.stochastic_sublayer
        self.post_norm = opt.post_norm

        if self.macaron:
            self.preprocess_mcr_ffn = preprocessing(opt.rezero, opt.model_size,
                                                    self.post_norm)
            self.postprocess_mcr_ffn = postprocessing(opt.rezero,
                                                      opt.model_size,
                                                      self.residual_dropout,
                                                      self.variational,
                                                      self.post_norm)

            if self.mfw:
                self.mcr_feedforward = MFWPositionWiseFeedForward(
                    opt.model_size,
                    opt.inner_size,
                    self.ffn_dropout,
                    variational=self.variational,
                    n_languages=opt.n_languages,
                    rank=opt.mfw_rank,
                    use_multiplicative=opt.mfw_multiplicative,
                    no_bias=opt.mfw_no_bias,
                    activation=opt.ffn_activation,
                    glu=opt.ffn_glu)

            else:
                self.mcr_feedforward = PositionWiseFeedForward(
                    opt.model_size,
                    opt.inner_size,
                    self.ffn_dropout,
                    variational=self.variational,
                    activation=opt.ffn_activation,
                    glu=opt.ffn_glu)

        self.preprocess_attn = preprocessing(opt.rezero, opt.model_size,
                                             self.post_norm)
        self.postprocess_attn = postprocessing(opt.rezero, opt.model_size,
                                               self.residual_dropout,
                                               self.variational,
                                               self.post_norm)

        if not self.ignore_source:
            self.preprocess_src_attn = preprocessing(opt.rezero,
                                                     opt.model_size,
                                                     self.post_norm)
            self.postprocess_src_attn = postprocessing(opt.rezero,
                                                       opt.model_size,
                                                       self.residual_dropout,
                                                       self.variational,
                                                       self.post_norm)

            if not self.mfw:
                self.multihead_src = EncdecMultiheadAttn(
                    opt.n_heads, opt.model_size, opt.attn_dropout)
            else:
                self.multihead_src = MFWEncdecMultiheadAttn(
                    opt.n_heads,
                    opt.model_size,
                    opt.attn_dropout,
                    n_languages=opt.n_languages,
                    rank=opt.mfw_rank,
                    use_multiplicative=opt.mfw_multiplicative,
                    no_bias=opt.mfw_no_bias,
                )

        self.preprocess_ffn = preprocessing(opt.rezero, opt.model_size,
                                            self.post_norm)
        self.postprocess_ffn = postprocessing(opt.rezero, opt.model_size,
                                              self.residual_dropout,
                                              self.variational, self.post_norm)

        d_head = opt.model_size // opt.n_heads

        if self.mfw:
            self.feedforward = MFWPositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                self.ffn_dropout,
                variational=self.variational,
                n_languages=opt.n_languages,
                rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative,
                no_bias=opt.mfw_no_bias,
                activation=opt.ffn_activation,
                glu=opt.ffn_glu)

            self.multihead_tgt = MFWRelativeSelfMultiheadAttn(
                opt.model_size,
                opt.n_heads,
                opt.attn_dropout,
                n_languages=opt.n_languages,
                rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative,
                no_bias=opt.mfw_no_bias,
            )
        else:

            self.feedforward = PositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                self.ffn_dropout,
                variational=self.variational,
                activation=opt.ffn_activation,
                glu=opt.ffn_glu)

            if not self.absolute_position_encoding:
                self.multihead_tgt = RelativeSelfMultiheadAttn(
                    opt.model_size,
                    opt.n_heads,
                    opt.attn_dropout,
                    learnable_pos=self.learnable_pos,
                    max_pos=opt.max_pos_length)

            else:
                self.multihead_tgt = SelfMultiheadAttn(opt.model_size,
                                                       opt.n_heads,
                                                       opt.attn_dropout)
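
Example #11 builds its pre/post-processing through preprocessing/postprocessing factory helpers keyed on opt.rezero. Those helpers are not shown here; as a hedged reading, ReZero replaces the layer-norm residual with a residual scaled by a learned scalar initialized at zero, roughly as in this minimal sketch (not the project's actual implementation):

    import torch
    import torch.nn as nn

    class ReZeroResidual(nn.Module):
        """Minimal ReZero-style residual: y = x + alpha * sublayer(x), with alpha starting at 0."""

        def __init__(self):
            super().__init__()
            self.alpha = nn.Parameter(torch.zeros(1))

        def forward(self, sublayer_output, residual_input):
            return residual_input + self.alpha * sublayer_output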
Example #12
    def __init__(self, opt, death_rate=0.0, lid_net=None):
        super(RelativeTransformerDecoderLayer, self).__init__()
        self.ignore_source = opt.ignore_source
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.mfw = opt.multilingual_factorized_weights
        self.macaron = opt.macaron
        self.ffn_scale = 0.5 if self.macaron else 1

        self.preprocess_attn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='n')
        self.postprocess_attn = PrePostProcessing(opt.model_size,
                                                  opt.dropout,
                                                  sequence='da',
                                                  variational=self.variational)

        if not self.ignore_source:
            self.preprocess_src_attn = PrePostProcessing(opt.model_size,
                                                         opt.dropout,
                                                         sequence='n')
            self.postprocess_src_attn = PrePostProcessing(
                opt.model_size,
                opt.dropout,
                sequence='da',
                variational=self.variational)

            if self.mfw:
                self.multihead_src = MFWEncdecMultiheadAttn(
                    opt.n_heads,
                    opt.model_size,
                    opt.attn_dropout,
                    n_languages=opt.n_languages,
                    rank=opt.mfw_rank,
                    use_multiplicative=opt.mfw_multiplicative)
            else:
                self.multihead_src = EncdecMultiheadAttn(
                    opt.n_heads, opt.model_size, opt.attn_dropout)

        self.preprocess_ffn = PrePostProcessing(opt.model_size,
                                                opt.dropout,
                                                sequence='n')
        self.postprocess_ffn = PrePostProcessing(opt.model_size,
                                                 opt.dropout,
                                                 sequence='da',
                                                 variational=self.variational)

        d_head = opt.model_size // opt.n_heads

        if not self.mfw:
            self.multihead_tgt = RelativeSelfMultiheadAttn(
                opt.model_size, opt.n_heads, opt.attn_dropout)

            self.feedforward = PositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                opt.dropout,
                variational=self.variational,
                activation=opt.activation)
        else:
            self.feedforward = MFWPositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                opt.dropout,
                variational=self.variational,
                n_languages=opt.n_languages,
                rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative)

            self.multihead_tgt = MFWRelativeSelfMultiheadAttn(
                opt.model_size,
                opt.n_heads,
                opt.attn_dropout,
                n_languages=opt.n_languages,
                rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative)

        self.lfv_multilingual = opt.lfv_multilingual

        if opt.lfv_multilingual:
            self.lid_net = lid_net
            self.lfv_mapper = nn.Linear(opt.bottleneck_size, opt.model_size)
        else:
            self.lid_net = None
            self.lfv_mapper = None
Example #13
    def __init__(self, opt, death_rate=0.0, lid_net=None):
        super(RelativeTransformerDecoderLayer, self).__init__()
        self.ignore_source = opt.ignore_source
        self.variational = opt.variational_dropout
        self.death_rate = death_rate
        self.mfw = opt.multilingual_factorized_weights
        self.mpw = opt.multilingual_partitioned_weights
        self.mln = opt.multilingual_layer_norm
        self.weight_drop = opt.weight_drop
        self.multilingual_adapter = opt.multilingual_adapter
        self.adapter_bottleneck_size = opt.adapter_bottleneck_size
        self.macaron = opt.macaron
        self.ffn_scale = 0.5 if self.macaron else 1
        self.rezero = opt.rezero
        self.learnable_pos = opt.learnable_position_encoding
        self.residual_dropout = opt.residual_dropout if opt.residual_dropout >= 0 else opt.dropout
        self.ffn_dropout = opt.ffn_dropout if opt.ffn_dropout >= 0 else opt.dropout

        self.preprocess_attn = preprocessing(self.rezero,
                                             opt.model_size,
                                             0.0,
                                             sequence='n',
                                             multilingual=self.mln,
                                             n_languages=opt.n_languages)

        self.postprocess_attn = PrePostProcessing(
            opt.model_size,
            self.residual_dropout,
            sequence='dz' if self.rezero else 'da',
            variational=self.variational)

        if self.macaron:
            self.preprocess_mcr_ffn = preprocessing(
                self.rezero,
                opt.model_size,
                0.0,
                sequence='n',
                multilingual=self.mln,
                n_languages=opt.n_languages)
            self.postprocess_mcr_ffn = PrePostProcessing(
                opt.model_size,
                self.residual_dropout,
                sequence='dz' if self.rezero else 'da',
                variational=self.variational)

            if self.mfw:
                self.mcr_feedforward = MFWPositionWiseFeedForward(
                    opt.model_size,
                    opt.inner_size,
                    self.ffn_dropout,
                    variational=self.variational,
                    n_languages=opt.n_languages,
                    rank=opt.mfw_rank,
                    use_multiplicative=opt.mfw_multiplicative,
                    activation=opt.ffn_activation,
                    glu=opt.ffn_glu)
            else:
                self.mcr_feedforward = PositionWiseFeedForward(
                    opt.model_size,
                    opt.inner_size,
                    self.ffn_dropout,
                    variational=self.variational,
                    activation=opt.ffn_activation,
                    glu=opt.ffn_glu)

        if not self.ignore_source:
            self.preprocess_src_attn = preprocessing(
                self.rezero,
                opt.model_size,
                0.0,
                sequence='n',
                multilingual=self.mln,
                n_languages=opt.n_languages)
            self.postprocess_src_attn = PrePostProcessing(
                opt.model_size,
                self.residual_dropout,
                sequence='dz' if self.rezero else 'da',
                variational=self.variational)

            if self.mfw:
                self.multihead_src = MFWEncdecMultiheadAttn(
                    opt.n_heads,
                    opt.model_size,
                    opt.attn_dropout,
                    n_languages=opt.n_languages,
                    rank=opt.mfw_rank,
                    use_multiplicative=opt.mfw_multiplicative,
                    weight_drop=self.weight_drop,
                    mfw_activation=opt.mfw_activation)
            elif self.mpw:
                self.multihead_src = MPEncdecMultiheadAttn(
                    opt.n_heads,
                    opt.model_size,
                    opt.attn_dropout,
                    factor_size=opt.mpw_factor_size)

            else:
                self.multihead_src = EncdecMultiheadAttn(
                    opt.n_heads, opt.model_size, opt.attn_dropout)

        self.preprocess_ffn = preprocessing(self.rezero,
                                            opt.model_size,
                                            0.0,
                                            sequence='n',
                                            multilingual=self.mln,
                                            n_languages=opt.n_languages)
        self.postprocess_ffn = PrePostProcessing(
            opt.model_size,
            self.residual_dropout,
            sequence='dz' if self.rezero else 'da',
            variational=self.variational)

        d_head = opt.model_size // opt.n_heads

        if self.mfw:
            self.feedforward = MFWPositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                self.ffn_dropout,
                variational=self.variational,
                n_languages=opt.n_languages,
                rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative,
                weight_drop=self.weight_drop,
                mfw_activation=opt.mfw_activation,
                activation=opt.ffn_activation,
                glu=opt.ffn_glu)

            self.multihead_tgt = MFWRelativeSelfMultiheadAttn(
                opt.model_size,
                opt.n_heads,
                opt.attn_dropout,
                n_languages=opt.n_languages,
                rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative,
                weight_drop=self.weight_drop,
                mfw_activation=opt.mfw_activation)
        elif self.mpw:
            self.feedforward = MPPositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                self.ffn_dropout,
                variational=self.variational,
                factor_size=opt.mpw_factor_size)

            self.multihead_tgt = MPRelativeSelfMultiheadAttn(
                opt.model_size,
                opt.n_heads,
                opt.attn_dropout,
                factor_size=opt.mpw_factor_size)
        else:
            self.multihead_tgt = RelativeSelfMultiheadAttn(
                opt.model_size,
                opt.n_heads,
                opt.attn_dropout,
                learnable_pos=self.learnable_pos,
                max_pos=opt.max_pos_length)

            self.feedforward = PositionWiseFeedForward(
                opt.model_size,
                opt.inner_size,
                self.ffn_dropout,
                variational=self.variational,
                activation=opt.ffn_activation,
                glu=opt.ffn_glu)

        # self.lfv_multilingual = opt.lfv_multilingual
        #
        # if opt.lfv_multilingual:
        #     self.lid_net = lid_net
        #     self.lfv_mapper = nn.Linear(opt.bottleneck_size, opt.model_size)
        # else:
        #     self.lid_net = None
        #     self.lfv_mapper = None

        if self.multilingual_adapter:
            from onmt.modules.multilingual_factorized.multilingual_adapters import MultilingualAdapter
            self.adapters = MultilingualAdapter(opt.model_size,
                                                opt.adapter_bottleneck_size,
                                                n_languages=opt.n_languages,
                                                dropout=opt.dropout)
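
Several of the snippets above swap in MFW* modules when multilingual_factorized_weights is set, giving each language a low-rank correction on top of shared weights. The actual MFW classes take additional arguments (rank, use_multiplicative, weight_drop, mfw_activation) and are not reproduced here; the sketch below only illustrates the general per-language low-rank factorization idea under that assumption:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class FactorizedLinearSketch(nn.Module):
        """Illustrative per-language factorized linear: W_lang = W + U[lang] @ V[lang]^T (rank r)."""

        def __init__(self, d_in, d_out, n_languages, rank):
            super().__init__()
            self.weight = nn.Parameter(torch.randn(d_out, d_in) * 0.02)   # shared weight
            self.u = nn.Parameter(torch.zeros(n_languages, d_out, rank))  # per-language factors
            self.v = nn.Parameter(torch.zeros(n_languages, d_in, rank))

        def forward(self, x, lang):
            w = self.weight + self.u[lang] @ self.v[lang].transpose(0, 1)
            return F.linear(x, w)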