def __init__(self, opt, embedding, language_embeddings=None, **kwargs):
    super(SpeechLSTMDecoder, self).__init__()  # Keep for reference

    # Define layers
    self.model_size = opt.model_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.variational_dropout = opt.variational_dropout
    self.multilingual_factorized_weights = opt.multilingual_factorized_weights
    self.mfw_rank = opt.mfw_rank
    self.encoder_type = opt.encoder_type
    self.n_languages = opt.n_languages

    self.lstm = nn.LSTM(self.model_size, self.model_size, self.layers,
                        dropout=self.dropout, batch_first=True)

    if self.multilingual_factorized_weights:
        from onmt.modules.weight_control_lstm import WeightFactoredLSTM
        self.lstm = WeightFactoredLSTM(self.lstm, dropout=opt.weight_drop,
                                       n_languages=opt.n_languages, rank=self.mfw_rank)

    self.fast_xattention = opt.fast_xattention
    self.n_head = 1  # fixed to always use 1 head
    # also fix attention dropout to 0.0

    if self.multilingual_factorized_weights:
        self.fast_xattention = True
        from onmt.modules.multilingual_factorized.encdec_attention import MFWEncdecMultiheadAttn
        self.multihead_tgt = MFWEncdecMultiheadAttn(self.n_head, opt.model_size, 0.0,
                                                    n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                    weight_drop=0.0)
    else:
        if opt.fast_xattention:
            self.multihead_tgt = EncdecMultiheadAttn(self.n_head, opt.model_size, 0.0)
        else:
            self.multihead_tgt = MultiHeadAttention(self.n_head, opt.model_size, attn_p=0.0, share=3)

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d',
                                              variational=self.variational_dropout)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
    self.preprocess_attn = PrePostProcessing(self.model_size, 0, sequence='n')

    self.word_lut = embedding
    self.encoder_cnn_downsampling = opt.cnn_downsampling
    self.language_embeddings = language_embeddings
    self.use_language_embedding = opt.use_language_embedding
    self.language_embedding_type = opt.language_embedding_type

    if self.language_embedding_type == 'concat':
        self.projector = nn.Linear(opt.model_size * 2, opt.model_size)

    print("* Create LSTM Decoder with %d layers." % self.layers)
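# Illustrative only: a minimal, hypothetical `opt` namespace covering the option fields the
# constructor above reads. The field names come from that code; the values, the vocabulary
# size, and the instantiation itself are placeholders, assuming the supporting onmt modules
# (EncdecMultiheadAttn, PrePostProcessing, ...) are importable.
from types import SimpleNamespace
import torch.nn as nn

opt = SimpleNamespace(
    model_size=512, layers=2, dropout=0.2, word_dropout=0.0, attn_dropout=0.0,
    emb_dropout=0.1, variational_dropout=False,
    multilingual_factorized_weights=False, mfw_rank=1, weight_drop=0.0,
    encoder_type='audio', n_languages=1, fast_xattention=False,
    cnn_downsampling=False, use_language_embedding=False,
    language_embedding_type='concat')

embedding = nn.Embedding(32000, opt.model_size)  # placeholder vocabulary size
decoder = SpeechLSTMDecoder(opt, embedding)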
def __init__(self, opt, death_rate=0.0, lid_net=None):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.mfw = opt.multilingual_factorized_weights
    self.mpw = opt.multilingual_partitioned_weights
    self.mln = opt.multilingual_layer_norm
    self.weight_drop = opt.weight_drop
    self.multilingual_adapter = opt.multilingual_adapter
    self.adapter_bottleneck_size = opt.adapter_bottleneck_size

    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                             multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)

    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(
            opt.model_size, opt.dropout, sequence='n',
            multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_src_attn = PrePostProcessing(
            opt.model_size, opt.dropout, sequence='da', variational=self.variational)

        if self.mfw:
            self.multihead_src = MFWEncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout,
                n_languages=opt.n_languages, rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative,
                weight_drop=self.weight_drop, mfw_activation=opt.mfw_activation)
        elif self.mpw:
            self.multihead_src = MPEncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout,
                factor_size=opt.mpw_factor_size)
        else:
            self.multihead_src = EncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout)

    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                            multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    d_head = opt.model_size // opt.n_heads

    if self.mfw:
        self.feedforward = MFWPositionWiseFeedForward(
            opt.model_size, opt.inner_size, opt.dropout,
            variational=self.variational, n_languages=opt.n_languages, rank=opt.mfw_rank,
            use_multiplicative=opt.mfw_multiplicative,
            weight_drop=self.weight_drop, mfw_activation=opt.mfw_activation)
        self.multihead_tgt = MFWRelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout,
            n_languages=opt.n_languages, rank=opt.mfw_rank,
            use_multiplicative=opt.mfw_multiplicative,
            weight_drop=self.weight_drop, mfw_activation=opt.mfw_activation)
    elif self.mpw:
        self.feedforward = MPPositionWiseFeedForward(
            opt.model_size, opt.inner_size, opt.dropout,
            variational=self.variational, factor_size=opt.mpw_factor_size)
        self.multihead_tgt = MPRelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout,
            factor_size=opt.mpw_factor_size)
    else:
        self.multihead_tgt = RelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout)
        self.feedforward = PositionWiseFeedForward(
            opt.model_size, opt.inner_size, opt.dropout,
            variational=self.variational)

    self.lfv_multilingual = opt.lfv_multilingual

    if opt.lfv_multilingual:
        self.lid_net = lid_net
        self.lfv_mapper = nn.Linear(opt.bottleneck_size, opt.model_size)
    else:
        self.lid_net = None
        self.lfv_mapper = None

    if self.multilingual_adapter:
        from onmt.modules.multilingual_factorized.multilingual_adapters import MultilingualAdapter
        self.adapters = MultilingualAdapter(opt.model_size, opt.adapter_bottleneck_size,
                                            n_languages=opt.n_languages, dropout=opt.dropout)
def __init__(self, opt, death_rate=0.0):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.batch_ensemble = opt.batch_ensemble
    self.mfw = opt.multilingual_factorized_weights
    self.macaron = opt.macaron
    self.ffn_scale = 0.5 if self.macaron else 1
    self.dropout = opt.dropout

    if self.macaron:
        self.preprocess_mcr_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_mcr_ffn = PrePostProcessing(
            opt.model_size, opt.dropout, sequence='da', variational=self.variational)

        if self.mfw:
            self.mcr_feedforward = MFWPositionWiseFeedForward(
                opt.model_size, opt.inner_size, opt.dropout,
                variational=self.variational, n_languages=opt.n_languages, rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative)
        else:
            self.mcr_feedforward = PositionWiseFeedForward(
                opt.model_size, opt.inner_size, opt.dropout,
                variational=self.variational)

    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)

    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(
            opt.model_size, opt.dropout, sequence='da', variational=self.variational)

        # if self.batch_ensemble > 0:
        #     self.multihead_src = BEEncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout,
        #                                                ensemble=self.batch_ensemble)
        # else:
        if not self.mfw:
            self.multihead_src = EncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout)
        else:
            self.multihead_src = MFWEncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout,
                n_languages=opt.n_languages, rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative)

    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    d_head = opt.model_size // opt.n_heads

    if self.mfw:
        self.feedforward = MFWPositionWiseFeedForward(
            opt.model_size, opt.inner_size, opt.dropout,
            variational=self.variational, n_languages=opt.n_languages, rank=opt.mfw_rank,
            use_multiplicative=opt.mfw_multiplicative)
        self.multihead_tgt = MFWRelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout,
            n_languages=opt.n_languages, rank=opt.mfw_rank,
            use_multiplicative=opt.mfw_multiplicative)
    else:
        self.feedforward = PositionWiseFeedForward(
            opt.model_size, opt.inner_size, opt.dropout,
            variational=self.variational)
        self.multihead_tgt = RelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout)
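# The ffn_scale of 0.5 stored when opt.macaron is set follows the Macaron-Net design, in
# which each of the two half-step feed-forward sublayers contributes a residual update
# scaled by 0.5. A minimal, hypothetical sketch of that pattern (not the repository's
# forward implementation; `feedforward` and `layer_norm` stand in for the modules built
# in the constructor above):
import torch

def macaron_half_ffn(x: torch.Tensor, feedforward, layer_norm, ffn_scale: float = 0.5) -> torch.Tensor:
    # Pre-norm, feed-forward, then a residual connection scaled by ffn_scale.
    return x + ffn_scale * feedforward(layer_norm(x))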
def __init__(self, opt, death_rate=0.0):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.batch_ensemble = opt.batch_ensemble
    self.mfw = opt.multilingual_factorized_weights
    self.macaron = opt.macaron
    self.ffn_scale = 0.5 if self.macaron else 1
    self.dropout = opt.dropout
    self.residual_dropout = opt.residual_dropout if opt.residual_dropout >= 0 else opt.dropout
    self.ffn_dropout = opt.ffn_dropout if opt.ffn_dropout >= 0 else opt.dropout
    self.rezero = opt.rezero
    self.n_heads = opt.n_heads
    self.absolute_position_encoding = opt.absolute_position_encoding
    self.learnable_pos = opt.learnable_position_encoding
    self.stochastic_sublayer = opt.stochastic_sublayer
    self.post_norm = opt.post_norm

    if self.macaron:
        self.preprocess_mcr_ffn = preprocessing(opt.rezero, opt.model_size, self.post_norm)
        self.postprocess_mcr_ffn = postprocessing(opt.rezero, opt.model_size, self.residual_dropout,
                                                  self.variational, self.post_norm)

        if self.mfw:
            self.mcr_feedforward = MFWPositionWiseFeedForward(
                opt.model_size, opt.inner_size, self.ffn_dropout,
                variational=self.variational, n_languages=opt.n_languages, rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative, no_bias=opt.mfw_no_bias,
                activation=opt.ffn_activation, glu=opt.ffn_glu)
        else:
            self.mcr_feedforward = PositionWiseFeedForward(
                opt.model_size, opt.inner_size, self.ffn_dropout,
                variational=self.variational, activation=opt.ffn_activation, glu=opt.ffn_glu)

    self.preprocess_attn = preprocessing(opt.rezero, opt.model_size, self.post_norm)
    self.postprocess_attn = postprocessing(opt.rezero, opt.model_size, self.residual_dropout,
                                           self.variational, self.post_norm)

    if not self.ignore_source:
        self.preprocess_src_attn = preprocessing(opt.rezero, opt.model_size, self.post_norm)
        self.postprocess_src_attn = postprocessing(opt.rezero, opt.model_size, self.residual_dropout,
                                                   self.variational, self.post_norm)

        if not self.mfw:
            self.multihead_src = EncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout)
        else:
            self.multihead_src = MFWEncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout,
                n_languages=opt.n_languages, rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative, no_bias=opt.mfw_no_bias)

    self.preprocess_ffn = preprocessing(opt.rezero, opt.model_size, self.post_norm)
    self.postprocess_ffn = postprocessing(opt.rezero, opt.model_size, self.residual_dropout,
                                          self.variational, self.post_norm)

    d_head = opt.model_size // opt.n_heads

    if self.mfw:
        self.feedforward = MFWPositionWiseFeedForward(
            opt.model_size, opt.inner_size, self.ffn_dropout,
            variational=self.variational, n_languages=opt.n_languages, rank=opt.mfw_rank,
            use_multiplicative=opt.mfw_multiplicative, no_bias=opt.mfw_no_bias,
            activation=opt.ffn_activation, glu=opt.ffn_glu)
        self.multihead_tgt = MFWRelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout,
            n_languages=opt.n_languages, rank=opt.mfw_rank,
            use_multiplicative=opt.mfw_multiplicative, no_bias=opt.mfw_no_bias)
    else:
        self.feedforward = PositionWiseFeedForward(
            opt.model_size, opt.inner_size, self.ffn_dropout,
            variational=self.variational, activation=opt.ffn_activation, glu=opt.ffn_glu)

        if not self.absolute_position_encoding:
            self.multihead_tgt = RelativeSelfMultiheadAttn(
                opt.model_size, opt.n_heads, opt.attn_dropout,
                learnable_pos=self.learnable_pos, max_pos=opt.max_pos_length)
        else:
            self.multihead_tgt = SelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
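# This variant lets residual_dropout and ffn_dropout fall back to the global dropout rate
# when they are given a negative value (the `>= 0` checks above). A tiny, hypothetical
# illustration of that fallback with placeholder values:
from types import SimpleNamespace

opt = SimpleNamespace(dropout=0.3, residual_dropout=-1, ffn_dropout=0.1)
residual_dropout = opt.residual_dropout if opt.residual_dropout >= 0 else opt.dropout  # -> 0.3
ffn_dropout = opt.ffn_dropout if opt.ffn_dropout >= 0 else opt.dropout                 # -> 0.1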
def __init__(self, opt, death_rate=0.0, lid_net=None):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.mfw = opt.multilingual_factorized_weights
    self.macaron = opt.macaron
    self.ffn_scale = 0.5 if self.macaron else 1

    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)

    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(
            opt.model_size, opt.dropout, sequence='da', variational=self.variational)

        if self.mfw:
            self.multihead_src = MFWEncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout,
                n_languages=opt.n_languages, rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative)
        else:
            self.multihead_src = EncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout)

    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    d_head = opt.model_size // opt.n_heads

    if not self.mfw:
        self.multihead_tgt = RelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout)
        self.feedforward = PositionWiseFeedForward(
            opt.model_size, opt.inner_size, opt.dropout,
            variational=self.variational, activation=opt.activation)
    else:
        self.feedforward = MFWPositionWiseFeedForward(
            opt.model_size, opt.inner_size, opt.dropout,
            variational=self.variational, n_languages=opt.n_languages, rank=opt.mfw_rank,
            use_multiplicative=opt.mfw_multiplicative)
        self.multihead_tgt = MFWRelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout,
            n_languages=opt.n_languages, rank=opt.mfw_rank,
            use_multiplicative=opt.mfw_multiplicative)

    self.lfv_multilingual = opt.lfv_multilingual

    if opt.lfv_multilingual:
        self.lid_net = lid_net
        self.lfv_mapper = nn.Linear(opt.bottleneck_size, opt.model_size)
    else:
        self.lid_net = None
        self.lfv_mapper = None
def __init__(self, opt, death_rate=0.0, lid_net=None):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.mfw = opt.multilingual_factorized_weights
    self.mpw = opt.multilingual_partitioned_weights
    self.mln = opt.multilingual_layer_norm
    self.weight_drop = opt.weight_drop
    self.multilingual_adapter = opt.multilingual_adapter
    self.adapter_bottleneck_size = opt.adapter_bottleneck_size
    self.macaron = opt.macaron
    self.ffn_scale = 0.5 if self.macaron else 1
    self.rezero = opt.rezero
    self.learnable_pos = opt.learnable_position_encoding
    self.residual_dropout = opt.residual_dropout if opt.residual_dropout >= 0 else opt.dropout
    self.ffn_dropout = opt.ffn_dropout if opt.ffn_dropout >= 0 else opt.dropout

    self.preprocess_attn = preprocessing(self.rezero, opt.model_size, 0.0, sequence='n',
                                         multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_attn = PrePostProcessing(
        opt.model_size, self.residual_dropout,
        sequence='dz' if self.rezero else 'da', variational=self.variational)

    if self.macaron:
        self.preprocess_mcr_ffn = preprocessing(
            self.rezero, opt.model_size, 0.0, sequence='n',
            multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_mcr_ffn = PrePostProcessing(
            opt.model_size, self.residual_dropout,
            sequence='dz' if self.rezero else 'da', variational=self.variational)

        if self.mfw:
            self.mcr_feedforward = MFWPositionWiseFeedForward(
                opt.model_size, opt.inner_size, self.ffn_dropout,
                variational=self.variational, n_languages=opt.n_languages, rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative,
                activation=opt.ffn_activation, glu=opt.ffn_glu)
        else:
            self.mcr_feedforward = PositionWiseFeedForward(
                opt.model_size, opt.inner_size, self.ffn_dropout,
                variational=self.variational, activation=opt.ffn_activation, glu=opt.ffn_glu)

    if not self.ignore_source:
        self.preprocess_src_attn = preprocessing(
            self.rezero, opt.model_size, 0.0, sequence='n',
            multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_src_attn = PrePostProcessing(
            opt.model_size, self.residual_dropout,
            sequence='dz' if self.rezero else 'da', variational=self.variational)

        if self.mfw:
            self.multihead_src = MFWEncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout,
                n_languages=opt.n_languages, rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative,
                weight_drop=self.weight_drop, mfw_activation=opt.mfw_activation)
        elif self.mpw:
            self.multihead_src = MPEncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout,
                factor_size=opt.mpw_factor_size)
        else:
            self.multihead_src = EncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout)

    self.preprocess_ffn = preprocessing(self.rezero, opt.model_size, 0.0, sequence='n',
                                        multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_ffn = PrePostProcessing(
        opt.model_size, self.residual_dropout,
        sequence='dz' if self.rezero else 'da', variational=self.variational)

    d_head = opt.model_size // opt.n_heads

    if self.mfw:
        self.feedforward = MFWPositionWiseFeedForward(
            opt.model_size, opt.inner_size, self.ffn_dropout,
            variational=self.variational, n_languages=opt.n_languages, rank=opt.mfw_rank,
            use_multiplicative=opt.mfw_multiplicative,
            weight_drop=self.weight_drop, mfw_activation=opt.mfw_activation,
            activation=opt.ffn_activation, glu=opt.ffn_glu)
        self.multihead_tgt = MFWRelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout,
            n_languages=opt.n_languages, rank=opt.mfw_rank,
            use_multiplicative=opt.mfw_multiplicative,
            weight_drop=self.weight_drop, mfw_activation=opt.mfw_activation)
    elif self.mpw:
        self.feedforward = MPPositionWiseFeedForward(
            opt.model_size, opt.inner_size, self.ffn_dropout,
            variational=self.variational, factor_size=opt.mpw_factor_size)
        self.multihead_tgt = MPRelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout,
            factor_size=opt.mpw_factor_size)
    else:
        self.multihead_tgt = RelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout,
            learnable_pos=self.learnable_pos, max_pos=opt.max_pos_length)
        self.feedforward = PositionWiseFeedForward(
            opt.model_size, opt.inner_size, self.ffn_dropout,
            variational=self.variational, activation=opt.ffn_activation, glu=opt.ffn_glu)

    # self.lfv_multilingual = opt.lfv_multilingual
    #
    # if opt.lfv_multilingual:
    #     self.lid_net = lid_net
    #     self.lfv_mapper = nn.Linear(opt.bottleneck_size, opt.model_size)
    # else:
    #     self.lid_net = None
    #     self.lfv_mapper = None

    if self.multilingual_adapter:
        from onmt.modules.multilingual_factorized.multilingual_adapters import MultilingualAdapter
        self.adapters = MultilingualAdapter(opt.model_size, opt.adapter_bottleneck_size,
                                            n_languages=opt.n_languages, dropout=opt.dropout)
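# Illustrative only: a hypothetical `opt` namespace enumerating the option fields the
# constructor above reads. The field names come from that code; the values are placeholders
# rather than the repository's defaults, and the instantiation assumes the supporting onmt
# modules (preprocessing, PrePostProcessing, the attention and feed-forward classes) are
# importable.
from types import SimpleNamespace

opt = SimpleNamespace(
    model_size=512, inner_size=2048, n_heads=8, dropout=0.1, attn_dropout=0.1,
    residual_dropout=-1, ffn_dropout=-1, variational_dropout=False,
    ignore_source=False, macaron=False, rezero=False,
    learnable_position_encoding=False, max_pos_length=512,
    ffn_activation='relu', ffn_glu=False,
    multilingual_factorized_weights=False, mfw_rank=1, mfw_multiplicative=False,
    mfw_activation='none', weight_drop=0.0,
    multilingual_partitioned_weights=False, mpw_factor_size=1,
    multilingual_layer_norm=False, n_languages=1,
    multilingual_adapter=False, adapter_bottleneck_size=256)

layer = RelativeTransformerDecoderLayer(opt, death_rate=0.0)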