def __init__(
    self,
    config: "TacotronConfig",
    ap: "AudioProcessor" = None,
    tokenizer: "TTSTokenizer" = None,
    speaker_manager: SpeakerManager = None,
):
    """Build the Tacotron layers described by ``config``.

    Args:
        config: Model configuration. Every key it yields on iteration is
            copied onto ``self`` as an attribute of the same name.
        ap: Forwarded unchanged to the parent constructor.
        tokenizer: Forwarded unchanged to the parent constructor.
        speaker_manager: Forwarded unchanged to the parent constructor.
    """
    super().__init__(config, ap, tokenizer, speaker_manager)

    # Mirror each config field on the instance so the rest of the class can
    # keep reading `self.<field>` directly.
    for field_name in config:
        setattr(self, field_name, config[field_name])

    def _make_decoder(reduction_factor):
        # The main and the coarse (DDC) decoder differ only in their
        # reduction factor; every other setting is read from `self`.
        return Decoder(
            self.decoder_in_features,
            self.decoder_output_dim,
            reduction_factor,
            self.memory_size,
            self.attention_type,
            self.windowing,
            self.attention_norm,
            self.prenet_type,
            self.prenet_dropout,
            self.use_forward_attn,
            self.transition_agent,
            self.forward_attn_mask,
            self.location_attn,
            self.attention_heads,
            self.separate_stopnet,
            self.max_decoder_steps,
        )

    # The decoder input width grows with every conditioning vector that is
    # enabled. `init_multispeaker` is called here to obtain the speaker
    # embedding size; it has to be called again in training, once the number
    # of speakers has been inferred from the dataset, to initialize the
    # speaker embedding layer itself.
    if self.use_speaker_embedding or self.use_d_vector_file:
        self.init_multispeaker(config)
        self.decoder_in_features += self.embedded_speaker_dim

    if self.use_gst:
        self.decoder_in_features += self.gst.gst_embedding_dim

    if self.use_capacitron_vae:
        self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim

    # Character embedding table; index 0 is the padding index.
    self.embedding = nn.Embedding(self.num_chars, 256, padding_idx=0)
    self.embedding.weight.data.normal_(0, 0.3)

    # Core encoder / decoder / postnet stack.
    self.encoder = Encoder(self.encoder_in_features)
    self.decoder = _make_decoder(self.r)
    self.postnet = PostCBHG(self.decoder_output_dim)
    self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, self.out_channels)

    # Propagate the inference-time prenet dropout switch to the decoder.
    self.decoder.prenet.dropout_at_inference = self.prenet_dropout_at_inference

    # Global style token layer: needs both a GST config and the enable flag.
    if self.gst and self.use_gst:
        gst_config = self.gst
        self.gst_layer = GST(
            num_mel=self.decoder_output_dim,
            num_heads=gst_config.gst_num_heads,
            num_style_tokens=gst_config.gst_num_style_tokens,
            gst_embedding_dim=gst_config.gst_embedding_dim,
        )

    # Capacitron VAE layer: needs both a Capacitron config and the enable flag.
    if self.capacitron_vae and self.use_capacitron_vae:
        vae_config = self.capacitron_vae

        # Optional conditioning inputs are passed as None when disabled.
        if self.use_speaker_embedding and vae_config.capacitron_use_speaker_embedding:
            vae_speaker_dim = self.embedded_speaker_dim
        else:
            vae_speaker_dim = None

        if vae_config.capacitron_use_text_summary_embeddings:
            vae_text_summary_dim = vae_config.capacitron_text_summary_embedding_dim
        else:
            vae_text_summary_dim = None

        self.capacitron_vae_layer = CapacitronVAE(
            num_mel=self.decoder_output_dim,
            encoder_output_dim=self.encoder_in_features,
            capacitron_VAE_embedding_dim=vae_config.capacitron_VAE_embedding_dim,
            speaker_embedding_dim=vae_speaker_dim,
            text_summary_embedding_dim=vae_text_summary_dim,
        )

    # Optional backward-pass decoder.
    if self.bidirectional_decoder:
        self._init_backward_decoder()

    # Double decoder consistency: a second decoder that uses `ddc_r` as its
    # reduction factor.
    if self.double_decoder_consistency:
        self.coarse_decoder = _make_decoder(self.ddc_r)
def __init__(self,
             num_chars,
             num_speakers,
             r=5,
             postnet_output_dim=1025,
             decoder_output_dim=80,
             attn_type='original',
             attn_win=False,
             attn_norm="sigmoid",
             prenet_type="original",
             prenet_dropout=True,
             forward_attn=False,
             trans_agent=False,
             forward_attn_mask=False,
             location_attn=True,
             attn_K=5,
             separate_stopnet=True,
             bidirectional_decoder=False,
             double_decoder_consistency=False,
             ddc_r=None,
             encoder_in_features=256,
             decoder_in_features=256,
             speaker_embedding_dim=None,
             gst=False,
             gst_embedding_dim=256,
             gst_num_heads=4,
             gst_style_tokens=10,
             memory_size=5,
             gst_use_speaker_embedding=False):
    """Build the Tacotron layers from explicit keyword arguments.

    All arguments except ``memory_size`` are forwarded positionally to the
    parent constructor; this method then creates the embedding, encoder,
    decoder, postnet and the optional GST / backward / coarse decoders.

    Args:
        num_chars: Size of the character embedding table.
        num_speakers: Speaker count; values above 1 enable speaker
            conditioning of the decoder input.
        r: Reduction factor passed to the main decoder.
        postnet_output_dim: Output width of the final linear layer.
        decoder_output_dim: Output width of the decoders; also the input
            width of the postnet and the ``num_mel`` of the GST layer.
        ddc_r: Reduction factor passed to the coarse decoder when
            ``double_decoder_consistency`` is enabled.
        speaker_embedding_dim: Width of the speaker vectors. Overridden
            with 256 when this model owns the speaker embedding table;
            required when ``self.embeddings_per_sample`` is set.
        memory_size: Passed to the decoders.
        (remaining arguments are passed through to the parent constructor
        and/or the ``Decoder`` / ``GST`` layers unchanged.)

    Raises:
        ValueError: If ``num_speakers > 1``, ``self.embeddings_per_sample``
            is set and ``speaker_embedding_dim`` is ``None``.
    """
    super(Tacotron, self).__init__(
        num_chars, num_speakers, r, postnet_output_dim, decoder_output_dim,
        attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
        forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K,
        separate_stopnet, bidirectional_decoder, double_decoder_consistency,
        ddc_r, encoder_in_features, decoder_in_features,
        speaker_embedding_dim, gst, gst_embedding_dim, gst_num_heads,
        gst_style_tokens, gst_use_speaker_embedding)

    # speaker embedding layers
    if self.num_speakers > 1:
        if not self.embeddings_per_sample:
            # The model owns a trainable speaker table of fixed width.
            speaker_embedding_dim = 256
            self.speaker_embedding = nn.Embedding(self.num_speakers,
                                                  speaker_embedding_dim)
            self.speaker_embedding.weight.data.normal_(0, 0.3)
        elif speaker_embedding_dim is None:
            # Without this check the addition below fails with an opaque
            # "int += NoneType" TypeError.
            raise ValueError(
                "`speaker_embedding_dim` must be set when "
                "`embeddings_per_sample` is enabled and `num_speakers` > 1.")

        # speaker and gst embeddings are concatenated in the decoder input
        self.decoder_in_features += speaker_embedding_dim

    # embedding layer
    self.embedding = nn.Embedding(num_chars, 256, padding_idx=0)
    self.embedding.weight.data.normal_(0, 0.3)

    # base model layers
    self.encoder = Encoder(self.encoder_in_features)
    self.decoder = Decoder(self.decoder_in_features, decoder_output_dim, r,
                           memory_size, attn_type, attn_win, attn_norm,
                           prenet_type, prenet_dropout, forward_attn,
                           trans_agent, forward_attn_mask, location_attn,
                           attn_K, separate_stopnet)
    self.postnet = PostCBHG(decoder_output_dim)
    self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2,
                                 postnet_output_dim)

    # global style token layers
    if self.gst:
        # Speaker conditioning of GST is only used with per-sample
        # embeddings and when explicitly requested.
        if self.embeddings_per_sample and self.gst_use_speaker_embedding:
            gst_speaker_dim = speaker_embedding_dim
        else:
            gst_speaker_dim = None
        # `num_mel` follows `decoder_output_dim` instead of a hard-coded 80
        # so the GST layer matches the decoder's configured output width
        # (identical for the default value of 80).
        self.gst_layer = GST(num_mel=decoder_output_dim,
                             num_heads=gst_num_heads,
                             num_style_tokens=gst_style_tokens,
                             gst_embedding_dim=self.gst_embedding_dim,
                             speaker_embedding_dim=gst_speaker_dim)

    # backward pass decoder
    if self.bidirectional_decoder:
        self._init_backward_decoder()

    # setup DDC: a second decoder that uses `ddc_r` as its reduction factor
    if self.double_decoder_consistency:
        self.coarse_decoder = Decoder(
            self.decoder_in_features, decoder_output_dim, ddc_r, memory_size,
            attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
            forward_attn, trans_agent, forward_attn_mask, location_attn,
            attn_K, separate_stopnet)