def __init__(self, num_mel=80, embedding_size=512):
    """Create the encoder, decoder, post-net and the two output projections.

    Args:
        num_mel: Output width of ``mel_linear``.
        embedding_size: Input width shared by ``stop_linear`` and
            ``mel_linear``.
    """
    super(TransformerTTS, self).__init__()

    # Core networks, each constructed with its own defaults.
    self.encoder = Encoder()
    self.decoder = Decoder()
    self.postnet = PostNet()

    # Both projection heads take the same input width.
    head_in = embedding_size
    # Single-unit stop projection, initialised with the 'sigmoid' scheme.
    self.stop_linear = Linear(head_in, 1, w_init='sigmoid')
    # Projection to ``num_mel`` output channels.
    self.mel_linear = Linear(head_in, num_mel)
def __init__(self, use_postnet=True, n_spkers=1, n_emotes=1):
    """Assemble FastSpeech2 with optional speaker and emotion embedding tables.

    Which optional parts are built is driven by the ``hp`` hyper-parameter
    module (``hp.use_spk_embed`` / ``hp.use_emo_embed``) and by
    ``use_postnet``.

    Args:
        use_postnet: When true, a ``PostNet`` is created as ``self.postnet``.
        n_spkers: Number of entries in the speaker embedding table; only
            read when ``hp.use_spk_embed`` is true.
        n_emotes: Number of entries in the emotion embedding table; only
            read when ``hp.use_emo_embed`` is true.
    """
    super(FastSpeech2, self).__init__()

    ### Speaker Embedding Table ###
    self.use_spk_embed = hp.use_spk_embed
    if self.use_spk_embed:
        self.n_spkers = n_spkers
        self.spk_embed_dim = hp.spk_embed_dim
        self.spk_embed_weight_std = hp.spk_embed_weight_std
        self.embed_speakers = Embedding(
            n_spkers,
            self.spk_embed_dim,
            padding_idx=None,
            std=self.spk_embed_weight_std,
        )

    ### Emotion Embedding Table ###
    self.use_emo_embed = hp.use_emo_embed
    if self.use_emo_embed:
        # The table size comes from the caller, mirroring ``n_spkers``.
        self.n_emotes = n_emotes
        self.emo_embed_dim = hp.emo_embed_dim
        self.emo_embed_weight_std = hp.emo_embed_weight_std
        self.embed_emotions = Embedding(
            n_emotes,
            self.emo_embed_dim,
            padding_idx=None,
            std=self.emo_embed_weight_std,
        )

    ### Encoder, Speaker Integrator, Variance Adaptor, Decoder, Postnet ###
    self.encoder = Encoder()
    # The integrator is only created when speaker embeddings are enabled.
    if self.use_spk_embed:
        self.speaker_integrator = SpeakerIntegrator()
    self.variance_adaptor = VarianceAdaptor()
    self.decoder = Decoder()
    self.mel_linear = nn.Linear(hp.decoder_hidden, hp.n_mel_channels)

    self.use_postnet = use_postnet
    if self.use_postnet:
        self.postnet = PostNet()
def __init__(self):
    """Build the FastSpeech2 pipeline modules.

    Creates, in order: encoder, variance adaptor, decoder, a projection
    from ``hp.decoder_hidden`` to ``hp.n_mel_channels``, and a post-net.
    """
    super(FastSpeech2, self).__init__()

    self.encoder = Encoder()
    self.variance_adaptor = VarianceAdaptor()
    self.decoder = Decoder()

    # Projection sizes are taken from the hyper-parameter module.
    proj_in, proj_out = hp.decoder_hidden, hp.n_mel_channels
    self.mel_linear = Linear(proj_in, proj_out)

    self.postnet = PostNet()
def __init__(self):
    """Build the FastSpeech pipeline modules.

    Creates, in order: encoder, length regulator, decoder, a projection
    from ``hp.decoder_output_size`` to ``hp.num_mels``, and a post-net.
    """
    super(FastSpeech, self).__init__()

    self.encoder = Encoder()
    self.length_regulator = LengthRegulator()
    self.decoder = Decoder()

    # Projection sizes are taken from the hyper-parameter module.
    proj_in, proj_out = hp.decoder_output_size, hp.num_mels
    self.mel_linear = Linear(proj_in, proj_out)

    self.postnet = PostNet()
def __init__(self, use_postnet=True):
    """Build the STYLER modules: style modeling, decoder and mel projection.

    Args:
        use_postnet: When true, a ``PostNet`` is created as ``self.postnet``;
            otherwise no ``postnet`` attribute is set.
    """
    super(STYLER, self).__init__()

    self.style_modeling = StyleModeling()
    self.decoder = Decoder()
    # Projects from ``hp.decoder_hidden`` to ``hp.n_mel_channels``.
    self.mel_linear = nn.Linear(hp.decoder_hidden, hp.n_mel_channels)

    self.use_postnet = use_postnet
    if self.use_postnet:
        self.postnet = PostNet()
def __init__(self, use_postnet=True):
    """Build a FastSpeech2 variant whose variance adaptor is a ``TacotronDuration``.

    Args:
        use_postnet: When true, a ``PostNet`` is created as ``self.postnet``;
            otherwise no ``postnet`` attribute is set.
    """
    super(FastSpeech2, self).__init__()

    self.encoder = Encoder()
    # The ``variance_adaptor`` slot holds a TacotronDuration module here.
    self.variance_adaptor = TacotronDuration()
    self.decoder = Decoder()

    # Projection sizes are taken from the hyper-parameter module.
    proj_in, proj_out = hp.decoder_hidden, hp.n_mel_channels
    self.mel_linear = nn.Linear(proj_in, proj_out)

    self.use_postnet = use_postnet
    if use_postnet:
        self.postnet = PostNet()
def __init__(self, use_postnet=True):
    """Build FastSpeech2 with output heads chosen by ``hp.vocoder``.

    When ``hp.vocoder`` equals ``'WORLD'`` two projections are created
    (``ap_linear`` and ``sp_linear``); otherwise a single ``mel_linear``
    projection is created.

    Args:
        use_postnet: When true, a ``PostNet`` is created as ``self.postnet``;
            otherwise no ``postnet`` attribute is set.
    """
    super(FastSpeech2, self).__init__()

    # Disabled in the original: self.gst = GST()
    self.encoder = Encoder()
    self.variance_adaptor = VarianceAdaptor()
    self.decoder = Decoder()

    world_vocoder = hp.vocoder == 'WORLD'
    if world_vocoder:
        # Disabled in the original: self.f0_decoder = Decoder()
        self.ap_linear = nn.Linear(hp.decoder_hidden, hp.n_ap_channels)
        self.sp_linear = nn.Linear(hp.decoder_hidden, hp.n_sp_channels)
    else:
        self.mel_linear = nn.Linear(hp.decoder_hidden, hp.n_mel_channels)

    self.use_postnet = use_postnet
    if use_postnet:
        self.postnet = PostNet()