Example #1
    def forward(
        self,
        speakers,
        texts,
        src_lens,
        max_src_len,
        mels=None,
        mel_lens=None,
        max_mel_len=None
    ):
        src_masks = get_mask_from_lengths(src_lens, max_src_len)
        mel_masks = (
            get_mask_from_lengths(mel_lens, max_mel_len)
            if mel_lens is not None
            else None
        )

        # Encoder
        text_encoding = self.text_encoder(texts, src_masks)
        # Note: speaker_embedding is consumed unconditionally below, so this
        # variant effectively assumes a multi-speaker setup (speaker_emb set).
        if self.speaker_emb is not None:
            speaker_embedding = self.speaker_emb(speakers)

        residual_encoding, attns, mus, log_vars = self.residual_encoder(
            mels, text_encoding, mel_masks, src_masks, max_mel_len, max_src_len, speaker_embedding
        )
        
        speaker_embedding = speaker_embedding.unsqueeze(1).expand(
            -1, max_src_len, -1
        )
        encodings = torch.cat([text_encoding, residual_encoding, speaker_embedding], dim=-1)

        # Duration Modeling
        durations, V = self.duration_predictor(encodings, src_masks)

        upsampled_rep, mel_masks, mel_lens, Ws = self.learned_upsampling(
            durations, V, src_lens, src_masks, max_src_len
        )

        # Decoder
        mel_iters, mel_masks = self.decoder(upsampled_rep, mel_masks)

        return (
            mel_iters,
            mel_masks,
            mel_lens,
            src_masks,
            src_lens,
            durations,
            mus,
            log_vars,
            attns,
            Ws,
        )
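
All four examples call a get_mask_from_lengths helper (and a global device) that are not shown on this page. Below is a minimal sketch of the helper, assuming the convention the masked_fill calls above imply, namely that True marks padded positions:

import torch

def get_mask_from_lengths(lengths, max_len=None):
    # lengths: [B] tensor of valid sequence lengths.
    # Returns a [B, max_len] bool mask with True at padded positions,
    # which the examples above then zero out or set to -inf.
    if max_len is None:
        max_len = lengths.max().item()
    ids = torch.arange(max_len, device=lengths.device).unsqueeze(0)
    return ids >= lengths.unsqueeze(1)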
Example #2
    def forward(self, duration, V, src_len, src_mask, max_src_len):

        batch_size = duration.shape[0]

        # Duration Interpretation
        mel_len = torch.round(duration.sum(-1)).long().to(device)
        mel_len = torch.clamp(mel_len, max=self.max_seq_len)
        max_mel_len = mel_len.max().item()
        mel_mask = get_mask_from_lengths(mel_len, max_mel_len)

        # Prepare Attention Mask
        src_mask_ = src_mask.unsqueeze(1).expand(-1, mel_mask.shape[1],
                                                 -1)  # [B, tgt_len, src_len]
        mel_mask_ = mel_mask.unsqueeze(-1).expand(
            -1, -1, src_mask.shape[1])  # [B, tgt_len, src_len]
        attn_mask = torch.zeros((src_mask.shape[0], mel_mask.shape[1],
                                 src_mask.shape[1])).to(device)
        attn_mask = attn_mask.masked_fill(src_mask_, 1.)
        attn_mask = attn_mask.masked_fill(mel_mask_, 1.)
        attn_mask = attn_mask.bool()

        # Token Boundary Grid
        e_k = torch.cumsum(duration, dim=1)
        s_k = e_k - duration
        e_k = e_k.unsqueeze(1).expand(batch_size, max_mel_len, -1)
        s_k = s_k.unsqueeze(1).expand(batch_size, max_mel_len, -1)
        t_arange = torch.arange(
            1, max_mel_len + 1,
            device=device).unsqueeze(0).unsqueeze(-1).expand(
                batch_size, -1, max_src_len)
        S = (t_arange - s_k).masked_fill(attn_mask, 0)
        E = (e_k - t_arange).masked_fill(attn_mask, 0)

        # Attention (W)
        W = self.swish_w(S, E, self.conv_w(V))  # [B, T, K, dim_w]
        W = self.linear_w(W).squeeze(-1).masked_fill(src_mask_,
                                                     -np.inf)  # [B, T, K]
        W = self.softmax_w(W)  # [B, T, K]
        W = W.masked_fill(mel_mask_, 0.)

        # Auxiliary Attention Context (C)
        C = self.swish_c(S, E, self.conv_c(V))  # [B, T, K, dim_c]

        # Upsampled Representation (O)
        upsampled_rep = torch.matmul(W, V) + self.linear_einsum(
            torch.einsum('btk,btkp->btp', W, C))  # [B, T, M]
        upsampled_rep = self.layer_norm(upsampled_rep)
        upsampled_rep = upsampled_rep.masked_fill(mel_mask.unsqueeze(-1), 0)

        return upsampled_rep, mel_mask, mel_len, W
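
The "# Token Boundary Grid" block is the core of this learned upsampling (in the style of Parallel Tacotron 2): for output frame t and token k, S measures how far t lies past the token's start and E how far it lies before the token's end. A standalone check of that arithmetic with toy values (batch of one, no masking):

import torch

# Toy values: one utterance, three tokens lasting 2, 1, and 3 frames.
duration = torch.tensor([[2., 1., 3.]])      # [B=1, K=3]
e_k = torch.cumsum(duration, dim=1)          # token ends:   [[2., 3., 6.]]
s_k = e_k - duration                         # token starts: [[0., 2., 3.]]
T = int(duration.sum(-1).max().item())       # 6 mel frames in total
t = torch.arange(1, T + 1).view(1, T, 1)     # frame indices 1..T
S = t - s_k.unsqueeze(1)                     # frames elapsed since each start
E = e_k.unsqueeze(1) - t                     # frames left until each end
# For token 0: S[0, :, 0] = [1, 2, 3, 4, 5, 6] and E[0, :, 0] = [1, 0, -1, ...],
# so only frames 1 and 2 fall inside its [0, 2] span, matching its duration of 2.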
Example #3
    def forward(self, log_duration, V, src_len, src_mask, max_src_len):

        batch_size = log_duration.shape[0]

        # Log Duration Interpretation
        log_duration = torch.clamp(log_duration,
                                   max=math.log(self.max_seq_len))
        duration = torch.maximum(
            torch.exp(log_duration) - 1,
            torch.ones_like(log_duration))  # prior: at least 1 frame per token
        mel_len = torch.round(duration.sum(-1)).long().to(device)
        mel_len = torch.clamp(mel_len, max=self.max_seq_len)
        max_mel_len = mel_len.max().item()
        mel_mask = get_mask_from_lengths(mel_len, max_mel_len)

        # Prepare Attention Mask
        src_mask_ = src_mask.float().unsqueeze(-1)  # [B, src_len, 1]
        mel_mask_ = mel_mask.float().unsqueeze(-1)  # [B, tgt_len, 1]
        attn_mask = torch.bmm(mel_mask_, src_mask_.transpose(
            -2, -1)).bool()  # [B, tgt_len, src_len]

        # Token Boundary Grid
        e_k = torch.cumsum(duration, dim=1)
        s_k = e_k - duration
        e_k = e_k.unsqueeze(1).expand(batch_size, max_mel_len, -1)
        s_k = s_k.unsqueeze(1).expand(batch_size, max_mel_len, -1)
        t_arange = torch.arange(
            1, max_mel_len + 1,
            device=device).unsqueeze(0).unsqueeze(-1).expand(
                batch_size, -1, max_src_len)
        S = (t_arange - s_k).masked_fill(attn_mask, 0)
        E = (e_k - t_arange).masked_fill(attn_mask, 0)

        # Attention (W)
        W = self.swish_w(S, E, self.conv_w(V))  # [B, T, K, dim_w]
        W = self.linear_w(W).squeeze(-1).masked_fill(attn_mask, -np.inf)
        W = self.softmax_w(W)

        # Auxiliary Attention Context (C)
        C = self.swish_c(S, E, self.conv_c(V))  # [B, T, K, dim_c]

        # Upsampled Representation (O)
        upsampled_rep = torch.matmul(W, V) + self.linear_einsum(
            torch.einsum('btk,btkp->btp', W, C))  # [B, T, M]
        upsampled_rep = self.layer_norm(upsampled_rep)
        upsampled_rep = upsampled_rep.masked_fill(mel_mask.unsqueeze(-1), 0)

        return upsampled_rep, mel_mask, mel_len
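
Example #3 differs from #2 mainly in its first block: the predictor outputs log-durations, which are clamped and decoded with a one-frame-per-token floor. A standalone check of that decoding, assuming max_seq_len = 1000 (a typical config value, not given above):

import math
import torch

max_seq_len = 1000  # assumed config value
log_duration = torch.tensor([[-2.0, 0.0, 1.5, 9.0]])
log_duration = torch.clamp(log_duration, max=math.log(max_seq_len))
duration = torch.maximum(torch.exp(log_duration) - 1,
                         torch.ones_like(log_duration))
# Tokens decoding below 1 frame are floored to 1; the 9.0 entry was first
# clamped to log(1000), so it decodes to 999 frames.
print(duration)  # tensor([[  1.0000,   1.0000,   3.4817, 999.0000]])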
Example #4
    def forward(
        self,
        speakers,
        texts,
        src_lens,
        max_src_len,
        mels=None,
        mel_lens=None,
        max_mel_len=None,
        p_targets=None,
        e_targets=None,
        d_targets=None,
        p_control=1.0,
        e_control=1.0,
        d_control=1.0,
    ):
        src_masks = get_mask_from_lengths(src_lens, max_src_len)
        mel_masks = (
            get_mask_from_lengths(mel_lens, max_mel_len)
            if mel_lens is not None
            else None
        )

        # Text (phoneme) encoder
        output = self.encoder(texts, src_masks)

        if self.speaker_emb is not None:
            output = output + self.speaker_emb(speakers).unsqueeze(1).expand(
                -1, max_src_len, -1
            )

        (
            output,
            p_predictions,
            e_predictions,
            log_d_predictions,
            d_rounded,
            mel_lens,
            mel_masks,
        ) = self.variance_adaptor(
            output,
            src_masks,
            mel_masks,
            max_mel_len,
            p_targets,
            e_targets,
            d_targets,
            p_control,
            e_control,
            d_control,
        )

        # Mel-spectrogram decoder and linear projection
        output, mel_masks = self.decoder(output, mel_masks)
        output = self.mel_linear(output)

        # PostNet residual refinement
        postnet_output = self.postnet(output) + output

        return (
            output,
            postnet_output,
            p_predictions,
            e_predictions,
            log_d_predictions,
            d_rounded,
            src_masks,
            mel_masks,
            src_lens,
            mel_lens,
        )
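
Example #4 does not show the variance adaptor internals. In ming024's FastSpeech2, from which this forward appears to be taken, the d_control factor rescales decoded durations before rounding; the helper below is a hedged sketch of that step (round_durations is a name invented here, not from the source):

import torch

def round_durations(log_d_predictions, d_control=1.0):
    # Decode log-durations, scale by the control factor (>1 slows speech),
    # then round to non-negative integer frame counts.
    return torch.clamp(
        torch.round((torch.exp(log_d_predictions) - 1) * d_control),
        min=0,
    ).long()

log_d = torch.tensor([[0.0, 1.0, 2.0]])
print(round_durations(log_d, d_control=1.2))  # tensor([[0, 2, 8]])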