def __init__(
         self,
         # network structure related
         idim,
         odim,
         adim=384,
         aheads=4,
         elayers=6,
         eunits=1536,
         dlayers=6,
         dunits=1536,
         postnet_layers=5,
         postnet_chans=256,
         postnet_filts=5,
         positionwise_layer_type="conv1d",
         positionwise_conv_kernel_size=1,
         use_scaled_pos_enc=True,
         use_batch_norm=True,
         encoder_normalize_before=True,
         decoder_normalize_before=True,
         encoder_concat_after=False,
         decoder_concat_after=False,
         reduction_factor=1,
         # encoder / decoder
         conformer_pos_enc_layer_type="rel_pos",
         conformer_self_attn_layer_type="rel_selfattn",
         conformer_activation_type="swish",
         use_macaron_style_in_conformer=True,
         use_cnn_in_conformer=True,
         conformer_enc_kernel_size=7,
         conformer_dec_kernel_size=31,
         # duration predictor
         duration_predictor_layers=2,
         duration_predictor_chans=256,
         duration_predictor_kernel_size=3,
         # energy predictor
         energy_predictor_layers=2,
         energy_predictor_chans=256,
         energy_predictor_kernel_size=3,
         energy_predictor_dropout=0.5,
         energy_embed_kernel_size=1,
         energy_embed_dropout=0.0,
         stop_gradient_from_energy_predictor=True,
         # pitch predictor
         pitch_predictor_layers=5,
         pitch_predictor_chans=256,
         pitch_predictor_kernel_size=5,
         pitch_predictor_dropout=0.5,
         pitch_embed_kernel_size=1,
         pitch_embed_dropout=0.0,
         stop_gradient_from_pitch_predictor=True,
         # pretrained spk emb
         spk_embed_dim=None,
         # training related
         transformer_enc_dropout_rate=0.2,
         transformer_enc_positional_dropout_rate=0.2,
         transformer_enc_attn_dropout_rate=0.2,
         transformer_dec_dropout_rate=0.2,
         transformer_dec_positional_dropout_rate=0.2,
         transformer_dec_attn_dropout_rate=0.2,
         duration_predictor_dropout_rate=0.2,
         postnet_dropout_rate=0.5,
         init_type="kaiming_uniform",
         init_enc_alpha=1.0,
         init_dec_alpha=1.0,
         use_masking=False,
         use_weighted_masking=True,
         lang='en'):  # not used in the constructor body shown here
     super().__init__()
     self.idim = idim
     self.odim = odim
     self.reduction_factor = reduction_factor
     self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor
     self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor
     self.use_scaled_pos_enc = use_scaled_pos_enc
     self.spk_embed_dim = spk_embed_dim
     self.padding_idx = 0
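      # Token IDs are embedded into adim-dimensional vectors; index 0 is
      # reserved for padding and receives no gradient updates.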
     encoder_input_layer = torch.nn.Embedding(num_embeddings=idim,
                                              embedding_dim=adim,
                                              padding_idx=self.padding_idx)
     self.encoder = Conformer(
         idim=idim,
         attention_dim=adim,
         attention_heads=aheads,
         linear_units=eunits,
         num_blocks=elayers,
         input_layer=encoder_input_layer,
         dropout_rate=transformer_enc_dropout_rate,
         positional_dropout_rate=transformer_enc_positional_dropout_rate,
         attention_dropout_rate=transformer_enc_attn_dropout_rate,
         normalize_before=encoder_normalize_before,
         concat_after=encoder_concat_after,
         positionwise_conv_kernel_size=positionwise_conv_kernel_size,
         macaron_style=use_macaron_style_in_conformer,
         use_cnn_module=use_cnn_in_conformer,
         cnn_module_kernel=conformer_enc_kernel_size)
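      # Optional speaker conditioning: encoder outputs are concatenated with
      # the pretrained speaker embedding and projected back down to adim.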
     if self.spk_embed_dim is not None:
         self.projection = torch.nn.Linear(adim + self.spk_embed_dim, adim)
     self.duration_predictor = DurationPredictor(
         idim=adim,
         n_layers=duration_predictor_layers,
         n_chans=duration_predictor_chans,
         kernel_size=duration_predictor_kernel_size,
         dropout_rate=duration_predictor_dropout_rate,
     )
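      # Duration, pitch, and energy predictors form the FastSpeech 2 variance
      # adaptor; the stop_gradient flags above detach the predictor inputs so
      # their gradients do not flow back into the encoder.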
     self.pitch_predictor = VariancePredictor(
         idim=adim,
         n_layers=pitch_predictor_layers,
         n_chans=pitch_predictor_chans,
         kernel_size=pitch_predictor_kernel_size,
         dropout_rate=pitch_predictor_dropout)
     self.pitch_embed = torch.nn.Sequential(
         torch.nn.Conv1d(in_channels=1,
                         out_channels=adim,
                         kernel_size=pitch_embed_kernel_size,
                         padding=(pitch_embed_kernel_size - 1) // 2),
         torch.nn.Dropout(pitch_embed_dropout))
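      # The predicted scalar pitch track is lifted back to adim by the Conv1d
      # above so it can be combined with the hidden states (as in FastSpeech 2);
      # energy gets the same treatment below.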
     self.energy_predictor = VariancePredictor(
         idim=adim,
         n_layers=energy_predictor_layers,
         n_chans=energy_predictor_chans,
         kernel_size=energy_predictor_kernel_size,
         dropout_rate=energy_predictor_dropout)
     self.energy_embed = torch.nn.Sequential(
         torch.nn.Conv1d(in_channels=1,
                         out_channels=adim,
                         kernel_size=energy_embed_kernel_size,
                         padding=(energy_embed_kernel_size - 1) // 2),
         torch.nn.Dropout(energy_embed_dropout))
     self.length_regulator = LengthRegulator()
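      # The length regulator repeats each encoder state according to its
      # predicted (or ground-truth) duration so the sequence reaches
      # spectrogram length before decoding. The decoder then consumes these
      # already-embedded states directly, so it needs no input layer
      # (input_layer=None) and its idim argument is unused (hence idim=0).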
     self.decoder = Conformer(
         idim=0,
         attention_dim=adim,
         attention_heads=aheads,
         linear_units=dunits,
         num_blocks=dlayers,
         input_layer=None,
         dropout_rate=transformer_dec_dropout_rate,
         positional_dropout_rate=transformer_dec_positional_dropout_rate,
         attention_dropout_rate=transformer_dec_attn_dropout_rate,
         normalize_before=decoder_normalize_before,
         concat_after=decoder_concat_after,
         positionwise_conv_kernel_size=positionwise_conv_kernel_size,
         macaron_style=use_macaron_style_in_conformer,
         use_cnn_module=use_cnn_in_conformer,
         cnn_module_kernel=conformer_dec_kernel_size)
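      # Each decoder state is projected to reduction_factor spectrogram frames
      # of size odim; with the default reduction_factor=1, one frame per state.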
     self.feat_out = torch.nn.Linear(adim, odim * reduction_factor)
     self.postnet = PostNet(idim=idim,
                            odim=odim,
                            n_layers=postnet_layers,
                            n_chans=postnet_chans,
                            n_filts=postnet_filts,
                            use_batch_norm=use_batch_norm,
                            dropout_rate=postnet_dropout_rate)
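      # Pretrained weights are loaded below, so the init_type / init_*_alpha
      # and masking arguments above have no effect in the constructor shown here.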
     self.load_state_dict(
         torch.load(os.path.join("Models", "FastSpeech2_Elizabeth",
                                 "best.pt"),
                    map_location='cpu')["model"])
def __init__(
        self,
        # network structure related
        idim,
        odim,
        embed_dim=0,
        eprenet_conv_layers=0,
        eprenet_conv_chans=0,
        eprenet_conv_filts=0,
        dprenet_layers=2,
        dprenet_units=256,
        elayers=6,
        eunits=1024,
        adim=512,
        aheads=4,
        dlayers=6,
        dunits=1024,
        postnet_layers=5,
        postnet_chans=256,
        postnet_filts=5,
        positionwise_layer_type="conv1d",
        positionwise_conv_kernel_size=1,
        use_scaled_pos_enc=True,
        use_batch_norm=True,
        encoder_normalize_before=True,
        decoder_normalize_before=True,
        encoder_concat_after=True,  # True according to https://github.com/soobinseo/Transformer-TTS
        decoder_concat_after=True,  # True according to https://github.com/soobinseo/Transformer-TTS
        reduction_factor=1,
        spk_embed_dim=None,
        spk_embed_integration_type="concat",  # training related
        transformer_enc_dropout_rate=0.1,
        transformer_enc_positional_dropout_rate=0.1,
        transformer_enc_attn_dropout_rate=0.1,
        transformer_dec_dropout_rate=0.1,
        transformer_dec_positional_dropout_rate=0.1,
        transformer_dec_attn_dropout_rate=0.1,
        transformer_enc_dec_attn_dropout_rate=0.1,
        eprenet_dropout_rate=0.0,
        dprenet_dropout_rate=0.5,
        postnet_dropout_rate=0.5,
        init_type="xavier_uniform",  # since we have little to no
        # asymetric activations, this seems to work better than kaiming
        init_enc_alpha=1.0,
        use_masking=False,  # either this or weighted masking, not both
        use_weighted_masking=True,  # helps when sample lengths within a batch differ greatly
        bce_pos_weight=7.0,  # scaling the loss of the stop token prediction
        loss_type="L1",
        use_guided_attn_loss=True,
        num_heads_applied_guided_attn=2,
        num_layers_applied_guided_attn=2,
        modules_applied_guided_attn=("encoder-decoder", ),
        guided_attn_loss_sigma=0.4,  # standard deviation from diagonal that is allowed
        guided_attn_loss_lambda=25.0):
        super().__init__()
        self.idim = idim
        self.odim = odim
        self.eos = idim - 1
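        # The highest token ID doubles as the end-of-sequence symbol.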
        self.spk_embed_dim = spk_embed_dim
        self.reduction_factor = reduction_factor
        self.use_guided_attn_loss = use_guided_attn_loss
        self.use_scaled_pos_enc = use_scaled_pos_enc
        if self.use_guided_attn_loss:
            if num_layers_applied_guided_attn == -1:
                self.num_layers_applied_guided_attn = elayers
            else:
                self.num_layers_applied_guided_attn = num_layers_applied_guided_attn
            if num_heads_applied_guided_attn == -1:
                self.num_heads_applied_guided_attn = aheads
            else:
                self.num_heads_applied_guided_attn = num_heads_applied_guided_attn
            self.modules_applied_guided_attn = modules_applied_guided_attn
        if self.spk_embed_dim is not None:
            self.spk_embed_integration_type = spk_embed_integration_type
        self.padding_idx = 0
        pos_enc_class = (ScaledPositionalEncoding
                         if self.use_scaled_pos_enc else PositionalEncoding)
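        # ScaledPositionalEncoding multiplies the positional encoding by a
        # learnable scalar (alpha); the plain PositionalEncoding keeps it fixed.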
        if eprenet_conv_layers != 0:
            encoder_input_layer = torch.nn.Sequential(
                EncoderPrenet(idim=idim,
                              embed_dim=embed_dim,
                              elayers=0,
                              econv_layers=eprenet_conv_layers,
                              econv_chans=eprenet_conv_chans,
                              econv_filts=eprenet_conv_filts,
                              use_batch_norm=use_batch_norm,
                              dropout_rate=eprenet_dropout_rate,
                              padding_idx=self.padding_idx),
                torch.nn.Linear(eprenet_conv_chans, adim))
        else:
            encoder_input_layer = torch.nn.Embedding(
                num_embeddings=idim,
                embedding_dim=adim,
                padding_idx=self.padding_idx)
        self.encoder = Encoder(
            idim=idim,
            attention_dim=adim,
            attention_heads=aheads,
            linear_units=eunits,
            num_blocks=elayers,
            input_layer=encoder_input_layer,
            dropout_rate=transformer_enc_dropout_rate,
            positional_dropout_rate=transformer_enc_positional_dropout_rate,
            attention_dropout_rate=transformer_enc_attn_dropout_rate,
            pos_enc_class=pos_enc_class,
            normalize_before=encoder_normalize_before,
            concat_after=encoder_concat_after,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_conv_kernel_size=positionwise_conv_kernel_size)
        if self.spk_embed_dim is not None:
            self.projection = torch.nn.Linear(adim + self.spk_embed_dim, adim)

        decoder_input_layer = torch.nn.Sequential(
            DecoderPrenet(idim=odim,
                          n_layers=dprenet_layers,
                          n_units=dprenet_units,
                          dropout_rate=dprenet_dropout_rate),
            torch.nn.Linear(dprenet_units, adim))
        self.decoder = Decoder(
            odim=odim,
            attention_dim=adim,
            attention_heads=aheads,
            linear_units=dunits,
            num_blocks=dlayers,
            dropout_rate=transformer_dec_dropout_rate,
            positional_dropout_rate=transformer_dec_positional_dropout_rate,
            self_attention_dropout_rate=transformer_dec_attn_dropout_rate,
            src_attention_dropout_rate=transformer_enc_dec_attn_dropout_rate,
            input_layer=decoder_input_layer,
            use_output_layer=False,
            pos_enc_class=pos_enc_class,
            normalize_before=decoder_normalize_before,
            concat_after=decoder_concat_after)
        self.feat_out = torch.nn.Linear(adim, odim * reduction_factor)
        self.prob_out = torch.nn.Linear(adim, reduction_factor)
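        # prob_out emits one stop-token logit per generated frame; its BCE loss
        # is re-weighted by bce_pos_weight because stop frames are rare.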
        self.postnet = PostNet(idim=idim,
                               odim=odim,
                               n_layers=postnet_layers,
                               n_chans=postnet_chans,
                               n_filts=postnet_filts,
                               use_batch_norm=use_batch_norm,
                               dropout_rate=postnet_dropout_rate)
        if self.use_guided_attn_loss:
            self.attn_criterion = GuidedMultiHeadAttentionLoss(
                sigma=guided_attn_loss_sigma, alpha=guided_attn_loss_lambda)
        self.criterion = TransformerLoss(
            use_masking=use_masking,
            use_weighted_masking=use_weighted_masking,
            bce_pos_weight=bce_pos_weight)
        self.load_state_dict(
            torch.load(os.path.join("Models", "TransformerTTS_Eva", "best.pt"),
                       map_location='cpu')["model"])