def forward(self, inputs):
    text_inputs, text_lengths, r_len_pad, mels, max_len, output_lengths, *_ = inputs
    text_lengths, output_lengths = text_lengths.data, output_lengths.data

    if self.drop_frame_rate > 0. and self.training:
        # mels shape: [B, n_mel_channels, T_out]
        mels = dropout_frame(mels, self.global_mean, output_lengths,
                             self.drop_frame_rate, r_len_pad)

    embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
    encoder_outputs = self.encoder(embedded_inputs, text_lengths)

    (decoder_outputs, mel_outputs,
     gate_outputs, alignments) = self.decoder(encoder_outputs, mels,
                                              memory_lengths=text_lengths)

    mel_outputs_postnet = self.postnet(mel_outputs)
    mel_outputs_postnet = mel_outputs + mel_outputs_postnet

    return self.parse_output(
        [decoder_outputs, mel_outputs, mel_outputs_postnet,
         gate_outputs, alignments],
        output_lengths)
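All three variants rely on a dropout_frame helper that is not shown in this section. Below is a minimal sketch of the drop-frame-rate trick it is assumed to implement: replace a random fraction of the ground-truth frames with the corpus global mean, so the decoder cannot lean too heavily on teacher-forced context. The semantics are an assumption, and the first variant's extra r_len_pad argument is omitted here.

import torch

def dropout_frame(mels, global_mean, mel_lengths, drop_frame_rate):
    # Assumed semantics: mels is [B, n_mel_channels, T_out] and
    # global_mean is [n_mel_channels]; overwrite a drop_frame_rate
    # fraction of the valid (non-padded) frames with the global mean.
    B, C, T = mels.shape
    drop = torch.rand(B, T, device=mels.device) < drop_frame_rate        # [B, T]
    valid = torch.arange(T, device=mels.device)[None, :] < mel_lengths[:, None]
    drop = drop & valid                       # never overwrite padding
    return torch.where(drop[:, None, :],      # [B, 1, T], broadcast over channels
                       global_mean[None, :, None],
                       mels)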
def forward(self, inputs, teacher_force_till=None, p_teacher_forcing=None,
            drop_frame_rate=None):
    (text, text_lengths, gt_mels, max_len, output_lengths, speaker_ids,
     torchmoji_hidden, preserve_decoder_states, align_padded,
     f0_padded) = inputs
    text_lengths, output_lengths = text_lengths.data, output_lengths.data

    # fall back to the model defaults when overrides are not supplied
    if teacher_force_till is None:
        teacher_force_till = self.teacher_force_till
    if p_teacher_forcing is None:
        p_teacher_forcing = self.p_teacher_forcing
    if drop_frame_rate is None:
        drop_frame_rate = self.drop_frame_rate

    if drop_frame_rate > 0. and self.training:
        # gt_mels shape: [B, n_mel_channels, T_out]
        gt_mels = dropout_frame(gt_mels, self.global_mean, output_lengths,
                                drop_frame_rate)

    embedded_text = self.embedding(text).transpose(1, 2)  # [B, embed, sequence]
    encoder_outputs = self.encoder(
        embedded_text, text_lengths,
        speaker_ids=speaker_ids)  # [B, time, encoder_out]

    if hasattr(self, "gst"):
        # create a style embedding from the reference mel, or from a
        # precomputed TorchMoji hidden state when one is available
        embedded_gst = self.gst(
            gt_mels if torchmoji_hidden is None else torchmoji_hidden,
            ref_mode=self.ref_mode)
        # repeat the token alongside the other embeddings for input to the decoder
        embedded_gst = embedded_gst.repeat(1, encoder_outputs.size(1), 1)
        encoder_outputs = torch.cat((encoder_outputs, embedded_gst),
                                    dim=2)  # [B, time, encoder_out]

    if hasattr(self, "speaker_embedding"):
        # concat the embedded speakers onto every encoder timestep
        embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
        embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
        encoder_outputs = torch.cat((encoder_outputs, embedded_speakers), dim=2)

    mel_outputs, gate_outputs, alignments = self.decoder(
        encoder_outputs, gt_mels, memory_lengths=text_lengths,
        f0s=f0_padded, preserve_decoder=preserve_decoder_states,
        teacher_force_till=teacher_force_till,
        p_teacher_forcing=p_teacher_forcing)

    mel_outputs_postnet = self.postnet(mel_outputs)
    mel_outputs_postnet = mel_outputs + mel_outputs_postnet

    return self.parse_output(
        [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
        output_lengths, text_lengths)
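None of the variants include parse_output. In NVIDIA's reference Tacotron2 it zeroes the padded tail of the mel outputs and pushes the padded gate energies toward "stop"; a sketch along those lines follows, written as a method on the model. mask_padding and n_mel_channels are the reference attributes, and the extra text_lengths argument taken by the second variant (presumably for masking alignments) is accepted but left unused here.

import torch

def get_mask_from_lengths(lengths):
    # [B, max_len] boolean mask, True on valid (non-padded) steps
    ids = torch.arange(int(lengths.max()), device=lengths.device)
    return ids[None, :] < lengths[:, None]

def parse_output(self, outputs, output_lengths=None, text_lengths=None):
    # outputs: [mel_outputs, mel_outputs_postnet, gate_outputs, alignments]
    if self.mask_padding and output_lengths is not None:
        mask = ~get_mask_from_lengths(output_lengths)         # True on padding
        mel_mask = mask[:, None, :].expand(-1, self.n_mel_channels, -1)
        outputs[0].data.masked_fill_(mel_mask, 0.0)           # mel_outputs
        outputs[1].data.masked_fill_(mel_mask, 0.0)           # postnet outputs
        outputs[2].data.masked_fill_(mask, 1e3)               # gate: force "stop"
    return outputs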
def forward(self, inputs, teacher_force_till=None, p_teacher_forcing=None,
            drop_frame_rate=None):
    inputs, input_lengths, mels, max_len, output_lengths, speaker_ids = inputs
    input_lengths, output_lengths = input_lengths.data, output_lengths.data

    # fall back to the model defaults when overrides are not supplied
    if teacher_force_till is None:
        teacher_force_till = self.teacher_force_till
    if p_teacher_forcing is None:
        p_teacher_forcing = self.p_teacher_forcing
    if drop_frame_rate is None:
        drop_frame_rate = self.drop_frame_rate

    if drop_frame_rate > 0. and self.training:
        # mels shape: [B, n_mel_channels, T_out]
        mels = dropout_frame(mels, self.global_mean, output_lengths,
                             drop_frame_rate)

    embedded_inputs = self.embedding(inputs).transpose(1, 2)  # [B, embed, sequence]

    if self.encoder_speaker_embed_dim and self.encoder_concat_speaker_embed == 'before':
        # concat the speaker embedding onto the text embedding before the encoder
        encoder_embedded_speakers = self.encoder_speaker_embedding(
            speaker_ids)[:, None].transpose(1, 2)  # [B, embed, 1]
        encoder_embedded_speakers = encoder_embedded_speakers.repeat(
            1, 1, embedded_inputs.size(2))
        embedded_inputs = torch.cat(
            (embedded_inputs, encoder_embedded_speakers),
            dim=1)  # [B, embed, sequence]
        embedded_text = self.encoder(
            embedded_inputs, input_lengths)  # [B, time, encoder_out]
    elif self.encoder_speaker_embed_dim and self.encoder_concat_speaker_embed == 'inside':
        # hand the speaker embedding to the encoder itself
        encoder_embedded_speakers = self.encoder_speaker_embedding(
            speaker_ids)[:, None].transpose(1, 2)  # [B, embed, 1]
        encoder_embedded_speakers = encoder_embedded_speakers.repeat(
            1, 1, embedded_inputs.size(2))
        embedded_text = self.encoder(
            embedded_inputs, input_lengths,
            speaker_embedding=encoder_embedded_speakers)  # [B, time, encoder_out]
    else:
        embedded_text = self.encoder(
            embedded_inputs, input_lengths)  # [B, time, encoder_out]

    if self.speaker_embedding_dim:
        embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
        embedded_speakers = embedded_speakers.repeat(1, embedded_text.size(1), 1)

    if self.with_gst:
        # create an embedding from the tokens of the reference mel, or
        # from the speaker ids, depending on drop_tokens_mode
        embedded_gst = self.gst(
            speaker_ids if self.drop_tokens_mode == 'speaker_embedding' else mels,
            ref_mode=self.ref_mode)
        # repeat the token alongside the other embeddings for input to the decoder
        embedded_gst = embedded_gst.repeat(1, embedded_text.size(1), 1)

    # concat whichever conditioning embeddings are active
    if self.with_gst and self.speaker_embedding_dim:
        encoder_outputs = torch.cat(
            (embedded_text, embedded_gst, embedded_speakers),
            dim=2)  # [batch, time, encoder_out]
    elif self.with_gst:
        encoder_outputs = torch.cat((embedded_text, embedded_gst),
                                    dim=2)  # [batch, time, encoder_out]
    elif self.speaker_embedding_dim:
        encoder_outputs = torch.cat((embedded_text, embedded_speakers),
                                    dim=2)  # [batch, time, encoder_out]
    else:
        encoder_outputs = embedded_text  # [batch, time, encoder_out]

    mel_outputs, gate_outputs, alignments = self.decoder(
        encoder_outputs, mels, memory_lengths=input_lengths,
        teacher_force_till=teacher_force_till,
        p_teacher_forcing=p_teacher_forcing)

    mel_outputs_postnet = self.postnet(mel_outputs)
    mel_outputs_postnet = mel_outputs + mel_outputs_postnet

    return self.parse_output(
        [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
        output_lengths)
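The repeat-and-concatenate pattern used for the GST and speaker embeddings is the same in all three variants: a single per-utterance vector is broadcast across every encoder timestep, then stacked onto the channel axis. A toy example with made-up sizes shows the shape bookkeeping:

import torch

B, T_enc, enc_dim, spk_dim = 2, 7, 512, 64
embedded_text = torch.randn(B, T_enc, enc_dim)   # stand-in encoder output
speaker_vec = torch.randn(B, spk_dim)            # one embedding per utterance

# [B, spk_dim] -> [B, 1, spk_dim] -> [B, T_enc, spk_dim]
embedded_speakers = speaker_vec[:, None].repeat(1, T_enc, 1)
encoder_outputs = torch.cat((embedded_text, embedded_speakers), dim=2)
print(encoder_outputs.shape)  # torch.Size([2, 7, 576])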