def __init__(self,
             # network structure related
             idim,
             odim,
             adim=384,
             aheads=4,
             elayers=6,
             eunits=1536,
             dlayers=6,
             dunits=1536,
             postnet_layers=5,
             postnet_chans=256,
             postnet_filts=5,
             positionwise_layer_type="conv1d",
             positionwise_conv_kernel_size=1,
             use_scaled_pos_enc=True,
             use_batch_norm=True,
             encoder_normalize_before=True,
             decoder_normalize_before=True,
             encoder_concat_after=False,
             decoder_concat_after=False,
             reduction_factor=1,
             # encoder / decoder
             conformer_pos_enc_layer_type="rel_pos",
             conformer_self_attn_layer_type="rel_selfattn",
             conformer_activation_type="swish",
             use_macaron_style_in_conformer=True,
             use_cnn_in_conformer=True,
             conformer_enc_kernel_size=7,
             conformer_dec_kernel_size=31,
             # duration predictor
             duration_predictor_layers=2,
             duration_predictor_chans=256,
             duration_predictor_kernel_size=3,
             # energy predictor
             energy_predictor_layers=2,
             energy_predictor_chans=256,
             energy_predictor_kernel_size=3,
             energy_predictor_dropout=0.5,
             energy_embed_kernel_size=1,
             energy_embed_dropout=0.0,
             stop_gradient_from_energy_predictor=True,
             # pitch predictor
             pitch_predictor_layers=5,
             pitch_predictor_chans=256,
             pitch_predictor_kernel_size=5,
             pitch_predictor_dropout=0.5,
             pitch_embed_kernel_size=1,
             pitch_embed_dropout=0.0,
             stop_gradient_from_pitch_predictor=True,
             # pretrained spk emb
             spk_embed_dim=None,
             # training related
             transformer_enc_dropout_rate=0.2,
             transformer_enc_positional_dropout_rate=0.2,
             transformer_enc_attn_dropout_rate=0.2,
             transformer_dec_dropout_rate=0.2,
             transformer_dec_positional_dropout_rate=0.2,
             transformer_dec_attn_dropout_rate=0.2,
             duration_predictor_dropout_rate=0.2,
             postnet_dropout_rate=0.5,
             init_type="kaiming_uniform",
             init_enc_alpha=1.0,
             init_dec_alpha=1.0,
             use_masking=False,
             use_weighted_masking=True,
             lang='en',
             # checkpoint to restore; None keeps the previously hard-coded
             # default path so existing callers see identical behavior
             weights_path=None):
    """Build a FastSpeech2-style Conformer TTS model and restore pretrained weights.

    Constructs the text encoder (Conformer), variance adaptors (duration,
    pitch and energy predictors with their embedding convolutions), the
    length regulator, the mel decoder (Conformer), the linear feature
    projection and the PostNet, then loads a pretrained ``state_dict``
    from ``weights_path``.

    Args:
        idim: Input symbol vocabulary size (embedding table rows).
        odim: Output feature dimension (e.g. number of mel bins).
        weights_path: Path to a checkpoint containing a ``"model"`` entry;
            defaults to ``Models/FastSpeech2_Elizabeth/best.pt``.

    NOTE(review): several accepted arguments (``positionwise_layer_type``,
    ``conformer_pos_enc_layer_type``, ``conformer_self_attn_layer_type``,
    ``conformer_activation_type``, ``init_type``, ``init_enc_alpha``,
    ``init_dec_alpha``, ``use_masking``, ``use_weighted_masking``, ``lang``)
    are not used in this constructor body — presumably kept for config
    compatibility; confirm against callers before removing.
    """
    super().__init__()

    # store hyperparameters consulted elsewhere in the model
    self.idim = idim
    self.odim = odim
    self.reduction_factor = reduction_factor
    self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor
    self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor
    self.use_scaled_pos_enc = use_scaled_pos_enc
    self.spk_embed_dim = spk_embed_dim

    # index 0 is reserved as the padding symbol
    self.padding_idx = 0

    # text encoder: embedding lookup feeding a Conformer stack
    encoder_input_layer = torch.nn.Embedding(num_embeddings=idim,
                                             embedding_dim=adim,
                                             padding_idx=self.padding_idx)
    self.encoder = Conformer(idim=idim,
                             attention_dim=adim,
                             attention_heads=aheads,
                             linear_units=eunits,
                             num_blocks=elayers,
                             input_layer=encoder_input_layer,
                             dropout_rate=transformer_enc_dropout_rate,
                             positional_dropout_rate=transformer_enc_positional_dropout_rate,
                             attention_dropout_rate=transformer_enc_attn_dropout_rate,
                             normalize_before=encoder_normalize_before,
                             concat_after=encoder_concat_after,
                             positionwise_conv_kernel_size=positionwise_conv_kernel_size,
                             macaron_style=use_macaron_style_in_conformer,
                             use_cnn_module=use_cnn_in_conformer,
                             cnn_module_kernel=conformer_enc_kernel_size)

    # optional projection to fold a pretrained speaker embedding into adim
    if self.spk_embed_dim is not None:
        self.projection = torch.nn.Linear(adim + self.spk_embed_dim, adim)

    # variance adaptors ----------------------------------------------------
    self.duration_predictor = DurationPredictor(idim=adim,
                                                n_layers=duration_predictor_layers,
                                                n_chans=duration_predictor_chans,
                                                kernel_size=duration_predictor_kernel_size,
                                                dropout_rate=duration_predictor_dropout_rate)

    self.pitch_predictor = VariancePredictor(idim=adim,
                                             n_layers=pitch_predictor_layers,
                                             n_chans=pitch_predictor_chans,
                                             kernel_size=pitch_predictor_kernel_size,
                                             dropout_rate=pitch_predictor_dropout)
    # continuous pitch value -> adim-channel embedding via 1D convolution
    self.pitch_embed = torch.nn.Sequential(
        torch.nn.Conv1d(in_channels=1,
                        out_channels=adim,
                        kernel_size=pitch_embed_kernel_size,
                        padding=(pitch_embed_kernel_size - 1) // 2),
        torch.nn.Dropout(pitch_embed_dropout))

    self.energy_predictor = VariancePredictor(idim=adim,
                                              n_layers=energy_predictor_layers,
                                              n_chans=energy_predictor_chans,
                                              kernel_size=energy_predictor_kernel_size,
                                              dropout_rate=energy_predictor_dropout)
    # continuous energy value -> adim-channel embedding via 1D convolution
    self.energy_embed = torch.nn.Sequential(
        torch.nn.Conv1d(in_channels=1,
                        out_channels=adim,
                        kernel_size=energy_embed_kernel_size,
                        padding=(energy_embed_kernel_size - 1) // 2),
        torch.nn.Dropout(energy_embed_dropout))

    # expands encoder states to frame resolution using predicted durations
    self.length_regulator = LengthRegulator()

    # mel decoder: Conformer stack without its own input layer (idim=0)
    self.decoder = Conformer(idim=0,
                             attention_dim=adim,
                             attention_heads=aheads,
                             linear_units=dunits,
                             num_blocks=dlayers,
                             input_layer=None,
                             dropout_rate=transformer_dec_dropout_rate,
                             positional_dropout_rate=transformer_dec_positional_dropout_rate,
                             attention_dropout_rate=transformer_dec_attn_dropout_rate,
                             normalize_before=decoder_normalize_before,
                             concat_after=decoder_concat_after,
                             positionwise_conv_kernel_size=positionwise_conv_kernel_size,
                             macaron_style=use_macaron_style_in_conformer,
                             use_cnn_module=use_cnn_in_conformer,
                             cnn_module_kernel=conformer_dec_kernel_size)

    # one linear layer emits reduction_factor frames per decoder step
    self.feat_out = torch.nn.Linear(adim, odim * reduction_factor)

    self.postnet = PostNet(idim=idim,
                           odim=odim,
                           n_layers=postnet_layers,
                           n_chans=postnet_chans,
                           n_filts=postnet_filts,
                           use_batch_norm=use_batch_norm,
                           dropout_rate=postnet_dropout_rate)

    # restore pretrained weights; CPU map_location keeps loading device-agnostic
    if weights_path is None:
        weights_path = os.path.join("Models", "FastSpeech2_Elizabeth", "best.pt")
    self.load_state_dict(torch.load(weights_path, map_location='cpu')["model"])
def __init__(self,
             # network structure related
             idim,
             odim,
             embed_dim=0,
             eprenet_conv_layers=0,
             eprenet_conv_chans=0,
             eprenet_conv_filts=0,
             dprenet_layers=2,
             dprenet_units=256,
             elayers=6,
             eunits=1024,
             adim=512,
             aheads=4,
             dlayers=6,
             dunits=1024,
             postnet_layers=5,
             postnet_chans=256,
             postnet_filts=5,
             positionwise_layer_type="conv1d",
             positionwise_conv_kernel_size=1,
             use_scaled_pos_enc=True,
             use_batch_norm=True,
             encoder_normalize_before=True,
             decoder_normalize_before=True,
             encoder_concat_after=True,  # True according to https://github.com/soobinseo/Transformer-TTS
             decoder_concat_after=True,  # True according to https://github.com/soobinseo/Transformer-TTS
             reduction_factor=1,
             spk_embed_dim=None,
             spk_embed_integration_type="concat",
             # training related
             transformer_enc_dropout_rate=0.1,
             transformer_enc_positional_dropout_rate=0.1,
             transformer_enc_attn_dropout_rate=0.1,
             transformer_dec_dropout_rate=0.1,
             transformer_dec_positional_dropout_rate=0.1,
             transformer_dec_attn_dropout_rate=0.1,
             transformer_enc_dec_attn_dropout_rate=0.1,
             eprenet_dropout_rate=0.0,
             dprenet_dropout_rate=0.5,
             postnet_dropout_rate=0.5,
             init_type="xavier_uniform",  # since we have little to no
             # asymetric activations, this seems to work better than kaiming
             init_enc_alpha=1.0,
             use_masking=False,  # either this or weighted masking, not both
             use_weighted_masking=True,  # if there are severely different sized samples in one batch
             bce_pos_weight=7.0,  # scaling the loss of the stop token prediction
             loss_type="L1",
             use_guided_attn_loss=True,
             num_heads_applied_guided_attn=2,
             num_layers_applied_guided_attn=2,
             modules_applied_guided_attn=("encoder-decoder", ),
             guided_attn_loss_sigma=0.4,  # standard deviation from diagonal that is allowed
             guided_attn_loss_lambda=25.0,
             # checkpoint to restore; None keeps the previously hard-coded
             # default path so existing callers see identical behavior
             weights_path=None):
    """Build a Transformer-TTS model and restore pretrained weights.

    Constructs the text encoder (optional convolutional prenet + Transformer
    encoder), the autoregressive decoder (decoder prenet + Transformer
    decoder), the feature/stop-token output projections, the PostNet and the
    training criteria (TransformerLoss, optional guided multi-head attention
    loss), then loads a pretrained ``state_dict`` from ``weights_path``.

    Args:
        idim: Input symbol vocabulary size; ``idim - 1`` is used as the EOS id.
        odim: Output feature dimension (e.g. number of mel bins).
        weights_path: Path to a checkpoint containing a ``"model"`` entry;
            defaults to ``Models/TransformerTTS_Eva/best.pt``.

    NOTE(review): ``positionwise_conv_kernel_size`` is forwarded but
    ``init_type``, ``init_enc_alpha``, ``loss_type`` and
    ``spk_embed_integration_type`` (beyond being stored) are not consumed in
    this constructor body — presumably used elsewhere; confirm before removing.
    """
    super().__init__()

    # store hyperparameters consulted elsewhere in the model
    self.idim = idim
    self.odim = odim
    self.eos = idim - 1  # last symbol id doubles as end-of-sequence marker
    self.spk_embed_dim = spk_embed_dim
    self.reduction_factor = reduction_factor
    self.use_scaled_pos_enc = use_scaled_pos_enc
    # NOTE: original code assigned use_guided_attn_loss twice; once suffices.
    self.use_guided_attn_loss = use_guided_attn_loss
    if self.use_guided_attn_loss:
        # -1 means "apply to every layer / every head"
        if num_layers_applied_guided_attn == -1:
            self.num_layers_applied_guided_attn = elayers
        else:
            self.num_layers_applied_guided_attn = num_layers_applied_guided_attn
        if num_heads_applied_guided_attn == -1:
            self.num_heads_applied_guided_attn = aheads
        else:
            self.num_heads_applied_guided_attn = num_heads_applied_guided_attn
        self.modules_applied_guided_attn = modules_applied_guided_attn
    if self.spk_embed_dim is not None:
        self.spk_embed_integration_type = spk_embed_integration_type

    # index 0 is reserved as the padding symbol
    self.padding_idx = 0

    pos_enc_class = (ScaledPositionalEncoding
                     if self.use_scaled_pos_enc else PositionalEncoding)

    # encoder input: conv prenet + projection to adim, or a plain embedding
    if eprenet_conv_layers != 0:
        encoder_input_layer = torch.nn.Sequential(
            EncoderPrenet(idim=idim,
                          embed_dim=embed_dim,
                          elayers=0,
                          econv_layers=eprenet_conv_layers,
                          econv_chans=eprenet_conv_chans,
                          econv_filts=eprenet_conv_filts,
                          use_batch_norm=use_batch_norm,
                          dropout_rate=eprenet_dropout_rate,
                          padding_idx=self.padding_idx),
            torch.nn.Linear(eprenet_conv_chans, adim))
    else:
        encoder_input_layer = torch.nn.Embedding(num_embeddings=idim,
                                                 embedding_dim=adim,
                                                 padding_idx=self.padding_idx)

    self.encoder = Encoder(idim=idim,
                           attention_dim=adim,
                           attention_heads=aheads,
                           linear_units=eunits,
                           num_blocks=elayers,
                           input_layer=encoder_input_layer,
                           dropout_rate=transformer_enc_dropout_rate,
                           positional_dropout_rate=transformer_enc_positional_dropout_rate,
                           attention_dropout_rate=transformer_enc_attn_dropout_rate,
                           pos_enc_class=pos_enc_class,
                           normalize_before=encoder_normalize_before,
                           concat_after=encoder_concat_after,
                           positionwise_layer_type=positionwise_layer_type,
                           positionwise_conv_kernel_size=positionwise_conv_kernel_size)

    # optional projection to fold a pretrained speaker embedding into adim
    if self.spk_embed_dim is not None:
        self.projection = torch.nn.Linear(adim + self.spk_embed_dim, adim)

    # decoder input: feedforward prenet on previous frames + projection to adim
    decoder_input_layer = torch.nn.Sequential(
        DecoderPrenet(idim=odim,
                      n_layers=dprenet_layers,
                      n_units=dprenet_units,
                      dropout_rate=dprenet_dropout_rate),
        torch.nn.Linear(dprenet_units, adim))
    self.decoder = Decoder(odim=odim,
                           attention_dim=adim,
                           attention_heads=aheads,
                           linear_units=dunits,
                           num_blocks=dlayers,
                           dropout_rate=transformer_dec_dropout_rate,
                           positional_dropout_rate=transformer_dec_positional_dropout_rate,
                           self_attention_dropout_rate=transformer_dec_attn_dropout_rate,
                           src_attention_dropout_rate=transformer_enc_dec_attn_dropout_rate,
                           input_layer=decoder_input_layer,
                           use_output_layer=False,
                           pos_enc_class=pos_enc_class,
                           normalize_before=decoder_normalize_before,
                           concat_after=decoder_concat_after)

    # feature frames (reduction_factor per step) and stop-token logits
    self.feat_out = torch.nn.Linear(adim, odim * reduction_factor)
    self.prob_out = torch.nn.Linear(adim, reduction_factor)

    self.postnet = PostNet(idim=idim,
                           odim=odim,
                           n_layers=postnet_layers,
                           n_chans=postnet_chans,
                           n_filts=postnet_filts,
                           use_batch_norm=use_batch_norm,
                           dropout_rate=postnet_dropout_rate)

    # training criteria
    self.criterion = TransformerLoss(use_masking=use_masking,
                                     use_weighted_masking=use_weighted_masking,
                                     bce_pos_weight=bce_pos_weight)
    # NOTE: original code built this criterion twice; one instance suffices.
    if self.use_guided_attn_loss:
        self.attn_criterion = GuidedMultiHeadAttentionLoss(
            sigma=guided_attn_loss_sigma,
            alpha=guided_attn_loss_lambda)

    # restore pretrained weights; CPU map_location keeps loading device-agnostic
    if weights_path is None:
        weights_path = os.path.join("Models", "TransformerTTS_Eva", "best.pt")
    self.load_state_dict(torch.load(weights_path, map_location='cpu')["model"])