Code Example #1
File: model.py  Project: keonlee9420/tacotron2_MMI
    def forward(self, inputs):
        text_inputs, text_lengths, r_len_pad, mels, max_len, output_lengths, *_ = inputs
        text_lengths, output_lengths = text_lengths.data, output_lengths.data

        if self.drop_frame_rate > 0. and self.training:
            # mels shape (B, n_mel_channels, T_out),
            mels = dropout_frame(mels, self.global_mean, output_lengths,
                                 self.drop_frame_rate, r_len_pad)

        embedded_inputs = self.embedding(text_inputs).transpose(1, 2)

        encoder_outputs = self.encoder(embedded_inputs, text_lengths)

        (decoder_outputs, mel_outputs, gate_outputs,
         alignments) = self.decoder(encoder_outputs,
                                    mels,
                                    memory_lengths=text_lengths)

        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        return self.parse_output([
            decoder_outputs, mel_outputs, mel_outputs_postnet, gate_outputs,
            alignments
        ], output_lengths)
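
All three examples on this page call a `dropout_frame` helper during training, which implements the "drop frame rate" regularization used in MMI-style Tacotron 2: randomly chosen mel frames of the ground-truth target are replaced by a precomputed global mean spectrum. The helper itself is not shown here; the sketch below is a minimal, assumed implementation matching the four-argument call used in Examples #2 and #3 (Example #1 additionally passes an `r_len_pad` offset, which this sketch does not handle). The masking logic and names are assumptions for illustration, not code from these projects.

import torch

def dropout_frame(mels, global_mean, mel_lengths, drop_frame_rate):
    """Assumed sketch: randomly replace mel frames with the global mean.

    mels:        (B, n_mel_channels, T_out)
    global_mean: (n_mel_channels,)
    mel_lengths: (B,) valid lengths along T_out
    """
    B, _, T = mels.shape
    # Mask of valid (non-padded) frames: (B, T)
    valid = torch.arange(T, device=mels.device)[None, :] < mel_lengths[:, None]
    # Bernoulli mask selecting frames to drop, restricted to valid frames
    drop = (torch.rand(B, T, device=mels.device) < drop_frame_rate) & valid
    drop = drop.unsqueeze(1).to(mels.dtype)  # (B, 1, T)
    # Replace dropped frames with the global mean spectrum
    return mels * (1.0 - drop) + global_mean[None, :, None] * drop
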
Code Example #2
File: model.py  Project: CookiePPP/codedump
    def forward(self,
                inputs,
                teacher_force_till=None,
                p_teacher_forcing=None,
                drop_frame_rate=None):
        text, text_lengths, gt_mels, max_len, output_lengths, speaker_ids, torchmoji_hidden, preserve_decoder_states, align_padded, f0_padded = inputs
        text_lengths, output_lengths = text_lengths.data, output_lengths.data

        # Fall back to the module-level defaults when the caller does not
        # override these arguments explicitly.
        if teacher_force_till is None:
            teacher_force_till = self.teacher_force_till
        if p_teacher_forcing is None:
            p_teacher_forcing = self.p_teacher_forcing
        if drop_frame_rate is None:
            drop_frame_rate = self.drop_frame_rate

        if drop_frame_rate > 0. and self.training:
            # gt_mels shape (B, n_mel_channels, T_out),
            gt_mels = dropout_frame(gt_mels, self.global_mean, output_lengths,
                                    drop_frame_rate)

        embedded_text = self.embedding(text).transpose(
            1, 2)  # [B, embed, sequence]
        encoder_outputs = self.encoder(
            embedded_text, text_lengths,
            speaker_ids=speaker_ids)  # [B, time, encoder_out]

        if hasattr(self, "gst"):
            embedded_gst = self.gst(
                gt_mels if (torchmoji_hidden is None) else torchmoji_hidden,
                ref_mode=self.ref_mode
            )  # create embedding from tokens from reference mel
            embedded_gst = embedded_gst.repeat(
                1, encoder_outputs.size(1), 1
            )  # repeat the token alongside the other embeddings for input to the decoder
            encoder_outputs = torch.cat((encoder_outputs, embedded_gst),
                                        dim=2)  # [batch, time, encoder_out]

        if hasattr(self, "speaker_embedding"):
            encoder_outputs = torch.cat(
                (encoder_outputs,
                 self.speaker_embedding(speaker_ids)[:, None].repeat(
                     1, encoder_outputs.size(1), 1)),
                dim=2)  #concat embedded speakers

        mel_outputs, gate_outputs, alignments = self.decoder(
            encoder_outputs,
            gt_mels,
            memory_lengths=text_lengths,
            f0s=f0_padded,
            preserve_decoder=preserve_decoder_states,
            teacher_force_till=teacher_force_till,
            p_teacher_forcing=p_teacher_forcing)

        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        return self.parse_output(
            [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
            output_lengths, text_lengths)
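
Example #2 (and Example #3 below) conditions the decoder by taking a single per-utterance embedding (a GST token or a speaker vector), repeating it along the encoder time axis, and concatenating it to the encoder outputs on the channel dimension. A minimal, self-contained sketch of that repeat-and-concat pattern, with assumed sizes:

import torch

B, T, enc_dim, spk_dim = 2, 50, 512, 128              # assumed sizes
encoder_outputs = torch.randn(B, T, enc_dim)           # [B, time, encoder_out]
speaker_vec = torch.randn(B, spk_dim)                  # one embedding per utterance

speaker_vec = speaker_vec[:, None].repeat(1, T, 1)     # broadcast to [B, time, spk_dim]
encoder_outputs = torch.cat((encoder_outputs, speaker_vec), dim=2)
print(encoder_outputs.shape)                           # torch.Size([2, 50, 640])
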
Code Example #3
    def forward(self,
                inputs,
                teacher_force_till=None,
                p_teacher_forcing=None,
                drop_frame_rate=None):
        inputs, input_lengths, mels, max_len, output_lengths, speaker_ids = inputs
        input_lengths, output_lengths = input_lengths.data, output_lengths.data

        # Fall back to the module-level defaults when the caller does not
        # override these arguments explicitly.
        if teacher_force_till is None:
            teacher_force_till = self.teacher_force_till
        if p_teacher_forcing is None:
            p_teacher_forcing = self.p_teacher_forcing
        if drop_frame_rate is None:
            drop_frame_rate = self.drop_frame_rate

        if drop_frame_rate > 0. and self.training:
            # mels shape (B, n_mel_channels, T_out),
            mels = dropout_frame(mels, self.global_mean, output_lengths,
                                 drop_frame_rate)

        embedded_inputs = self.embedding(inputs).transpose(
            1, 2)  # [B, embed, sequence]
        if self.encoder_speaker_embed_dim and self.encoder_concat_speaker_embed == 'before':
            encoder_embedded_speakers = self.encoder_speaker_embedding(
                speaker_ids)[:, None].transpose(1, 2)  # [B, embed, sequence]
            encoder_embedded_speakers = encoder_embedded_speakers.repeat(
                1, 1, embedded_inputs.size(2))
            embedded_inputs = torch.cat(
                (embedded_inputs, encoder_embedded_speakers),
                dim=1)  # [B, embed, sequence]
            embedded_text = self.encoder(
                embedded_inputs, input_lengths)  # [B, time, encoder_out]
        elif self.encoder_speaker_embed_dim and self.encoder_concat_speaker_embed == 'inside':
            encoder_embedded_speakers = self.encoder_speaker_embedding(
                speaker_ids)[:, None].transpose(1, 2)  # [B, embed, sequence]
            encoder_embedded_speakers = encoder_embedded_speakers.repeat(
                1, 1, embedded_inputs.size(2))
            embedded_text = self.encoder(
                embedded_inputs,
                input_lengths,
                speaker_embedding=encoder_embedded_speakers
            )  # [B, time, encoder_out]
        else:
            embedded_text = self.encoder(
                embedded_inputs, input_lengths)  # [B, time, encoder_out]

        if self.speaker_embedding_dim:
            embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
            embedded_speakers = embedded_speakers.repeat(
                1, embedded_text.size(1), 1)

        if self.with_gst:
            embedded_gst = self.gst(
                speaker_ids
                if self.drop_tokens_mode == 'speaker_embedding' else mels,
                ref_mode=self.ref_mode
            )  # create embedding from tokens from reference mel
            embedded_gst = embedded_gst.repeat(
                1, embedded_text.size(1), 1
            )  # repeat the token alongside the other embeddings for input to the decoder

        # Concatenate whichever optional conditioning embeddings are enabled.
        if self.with_gst and self.speaker_embedding_dim:
            encoder_outputs = torch.cat(
                (embedded_text, embedded_gst, embedded_speakers),
                dim=2)  # [batch, time, encoder_out]
        elif self.with_gst:
            encoder_outputs = torch.cat((embedded_text, embedded_gst),
                                        dim=2)  # [batch, time, encoder_out]
        elif self.speaker_embedding_dim:
            encoder_outputs = torch.cat((embedded_text, embedded_speakers),
                                        dim=2)  # [batch, time, encoder_out]
        else:
            encoder_outputs = embedded_text  # [batch, time, encoder_out]

        mel_outputs, gate_outputs, alignments = self.decoder(
            encoder_outputs,
            mels,
            memory_lengths=input_lengths,
            teacher_force_till=teacher_force_till,
            p_teacher_forcing=p_teacher_forcing)

        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        return self.parse_output(
            [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
            output_lengths)
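
For reference, each `forward` above receives the whole training batch as a single tuple. A hypothetical call matching Example #3's input layout might look like the following; the tensor sizes and the `model` instance are assumptions for illustration, not taken from the project:

import torch

B, n_symbols, n_mels, T_in, T_out = 2, 148, 80, 50, 400   # assumed sizes
text = torch.randint(0, n_symbols, (B, T_in))
text_lengths = torch.tensor([T_in, T_in - 10])
mels = torch.randn(B, n_mels, T_out)
output_lengths = torch.tensor([T_out, T_out - 60])
speaker_ids = torch.zeros(B, dtype=torch.long)

batch = (text, text_lengths, mels, T_out, output_lengths, speaker_ids)
# mel_out, mel_out_postnet, gate_out, alignments = model(batch)
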