Example #1
    def test_in_out(self):
        # test input == target
        layer = L1LossMasked()
        dummy_input = T.ones(4, 8, 128).float()
        dummy_target = T.ones(4, 8, 128).float()
        dummy_length = (T.ones(4) * 8).long()
        output = layer(dummy_input, dummy_target, dummy_length)
        assert output.item() == 0.0

        # test input != target
        dummy_input = T.ones(4, 8, 128).float()
        dummy_target = T.zeros(4, 8, 128).float()
        dummy_length = (T.ones(4) * 8).long()
        output = layer(dummy_input, dummy_target, dummy_length)
        assert output.item() == 1.0, "1.0 vs {}".format(output.item())

        # test if padded values of input make any difference
        dummy_input = T.ones(4, 8, 128).float()
        dummy_target = T.zeros(4, 8, 128).float()
        dummy_length = (T.arange(5, 9)).long()
        mask = ((sequence_mask(dummy_length).float() - 1.0) *
                100.0).unsqueeze(2)
        output = layer(dummy_input + mask, dummy_target, dummy_length)
        assert output.item() == 1.0, "1.0 vs {}".format(output.item())

        # test random input where the target equals the input inside the mask
        dummy_input = T.rand(4, 8, 128).float()
        dummy_target = dummy_input.detach()
        dummy_length = (T.arange(5, 9)).long()
        mask = ((sequence_mask(dummy_length).float() - 1.0) *
                100.0).unsqueeze(2)
        output = layer(dummy_input + mask, dummy_target, dummy_length)
        assert output.item() == 0, "0 vs {}".format(output.item())
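
All of these examples call a sequence_mask helper whose definition is not shown here. A minimal sketch consistent with the observed usage (a boolean mask of shape (batch, max_len) that is True inside each sequence; parameter names follow Examples #3 and #4):

    import torch

    def sequence_mask(sequence_length, max_len=None):
        """Sketch: boolean mask of shape (batch, max_len), True where index < length."""
        if max_len is None:
            max_len = sequence_length.max().item()
        positions = torch.arange(max_len, device=sequence_length.device)
        # (1, max_len) < (batch, 1) broadcasts to (batch, max_len)
        return positions.unsqueeze(0) < sequence_length.unsqueeze(1)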
Example #2
 def compute_masks(self, text_lengths, mel_lengths):
     """Compute masks  against sequence paddings."""
     # B x T_in_max (boolean)
     device = text_lengths.device
     input_mask = sequence_mask(text_lengths).to(device)
     output_mask = None
     if mel_lengths is not None:
          max_len = mel_lengths.max()
          r = self.decoder.r
          # pad max_len up to a multiple of the decoder reduction factor r
          if max_len % r > 0:
              max_len += r - (max_len % r)
         output_mask = sequence_mask(mel_lengths,
                                     max_len=max_len).to(device)
     return input_mask, output_mask
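
The branch above rounds max_len up to the nearest multiple of the decoder's reduction factor r (typically the number of frames the decoder emits per step), so the output mask covers whole decoder steps. A quick worked example with assumed values:

    # round max_len up to the nearest multiple of r, as compute_masks does
    max_len, r = 13, 5
    if max_len % r > 0:
        max_len += r - (max_len % r)
    assert max_len == 15  # 13 -> 15, while an exact multiple like 10 stays 10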
Example #3
 def forward(self, x, target, length):
     """
     Args:
          x: A FloatTensor of size (batch, max_len, dim) containing the
              predicted values.
          target: A FloatTensor of the same size containing the target
              values for each step.
          length: A LongTensor of size (batch,) containing the length of
              each sequence in the batch.
      Returns:
          loss: An average loss value masked by the length.
     """
     # mask: (batch, max_len, 1)
     target.requires_grad = False
     mask = sequence_mask(
         sequence_length=length, max_len=target.size(1)).unsqueeze(2).float()
     if self.seq_len_norm:
         norm_w = mask / mask.sum(dim=1, keepdim=True)
         out_weights = norm_w.div(target.shape[0] * target.shape[2])
         mask = mask.expand_as(x)
         loss = functional.mse_loss(
             x * mask, target * mask, reduction='none')
         loss = loss.mul(out_weights.to(loss.device)).sum()
     else:
         mask = mask.expand_as(x)
         loss = functional.mse_loss(
             x * mask, target * mask, reduction='sum')
         loss = loss / mask.sum()
     return loss
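
A quick self-check of the seq_len_norm=False branch above, reusing the sequence_mask sketch from after Example #1 (a sketch, not the project's own test): when input and target agree inside the mask, the masked mean is exactly zero.

    import torch
    import torch.nn.functional as F

    x = torch.rand(4, 8, 128)
    target = x.clone()
    length = torch.tensor([5, 6, 7, 8])
    # mask: (batch, max_len, 1) expanded to (batch, max_len, dim)
    mask = sequence_mask(length, max_len=8).unsqueeze(2).float().expand_as(x)
    loss = F.mse_loss(x * mask, target * mask, reduction='sum') / mask.sum()
    assert loss.item() == 0.0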
Example #4
 def forward(self, x, target, length):
     """
     Args:
          x: A FloatTensor of size (batch, max_len) containing the
              unnormalized (logit) stop-token predictions.
          target: A FloatTensor of size (batch, max_len) containing the
              binary stop target for each step.
          length: A LongTensor of size (batch,) containing the length of
              each sequence in the batch.
      Returns:
          loss: An average loss value masked by the length.
     """
      # mask: (batch, max_len)
     target.requires_grad = False
     mask = sequence_mask(sequence_length=length,
                          max_len=target.size(1)).float()
     loss = functional.binary_cross_entropy_with_logits(
         x * mask,
         target * mask,
         pos_weight=self.pos_weight,
         reduction='sum')
     loss = loss / mask.sum()
     return loss
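
Here x holds stop-token logits and target the binary stop flags. A minimal sketch of the same masked reduction (pos_weight omitted; sequence_mask as sketched after Example #1):

    import torch
    from torch.nn import functional

    logits = torch.randn(4, 8)      # one stop logit per decoder step
    stops = torch.zeros(4, 8)
    stops[:, -1] = 1.0              # stop fires on the last frame
    length = torch.full((4,), 8, dtype=torch.long)
    mask = sequence_mask(length, max_len=8).float()
    loss = functional.binary_cross_entropy_with_logits(
        logits * mask, stops * mask, reduction='sum') / mask.sum()

One subtlety: multiplying a logit by a 0/1 mask drives it to 0 (probability 0.5) rather than dropping it, so each padded step would still add log 2 to the sum before normalization. Computing the per-element loss with reduction='none' and masking that instead would exclude padding exactly.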
Example #5
    def forward(self, text, text_lengths, mel_specs=None, speaker_ids=None, ref_cond=True):
        self._init_states()
        # compute mask for padding
        mask = sequence_mask(text_lengths).to(text.device)
        embedded_inputs = self.embedding(text).transpose(1, 2)
        encoder_outputs = self.encoder(embedded_inputs, text_lengths)
        encoder_outputs = self._add_speaker_embedding(encoder_outputs,
                                                      speaker_ids)

        if ref_cond:
            prosody_outputs, mu, logvar, z = self.vae_gst(mel_specs)
            prosody_outputs = prosody_outputs.unsqueeze(1).expand_as(encoder_outputs)
            encoder_outputs = encoder_outputs + prosody_outputs

        decoder_outputs, alignments, stop_tokens = self.decoder(
            encoder_outputs, mel_specs, mask)
        postnet_outputs = self.postnet(decoder_outputs)
        postnet_outputs = decoder_outputs + postnet_outputs
        decoder_outputs, postnet_outputs, alignments = self.shape_outputs(
            decoder_outputs, postnet_outputs, alignments)

        if self.bidirectional_decoder:
            decoder_outputs_backward, alignments_backward = self._backward_inference(mel_specs, encoder_outputs, mask)
            if ref_cond:
                return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward, mu, logvar, z
            else:
                return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
        else:
            if ref_cond:
                return decoder_outputs, postnet_outputs, alignments, stop_tokens, mu, logvar, z
            else:
                return decoder_outputs, postnet_outputs, alignments, stop_tokens
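
The VAE-GST branch above adds one prosody vector per utterance to every encoder step. A small broadcasting demo with assumed shapes:

    import torch

    encoder_outputs = torch.randn(2, 7, 256)    # (B, T_in, dim)
    prosody = torch.randn(2, 256)               # one prosody vector per utterance
    prosody = prosody.unsqueeze(1).expand_as(encoder_outputs)  # (B, T_in, dim)
    conditioned = encoder_outputs + prosody
    print(conditioned.shape)  # torch.Size([2, 7, 256])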
Example #6
 def forward(self, characters, text_lengths, mel_specs):
     B = characters.size(0)
     mask = sequence_mask(text_lengths).to(characters.device)
     inputs = self.embedding(characters)
     encoder_outputs = self.encoder(inputs)
     mel_outputs, alignments, stop_tokens = self.decoder(
         encoder_outputs, mel_specs, mask)
     mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
     linear_outputs = self.postnet(mel_outputs)
     linear_outputs = self.last_linear(linear_outputs)
     return mel_outputs, linear_outputs, alignments, stop_tokens
Example #7
 def forward(self, text, text_lengths, mel_specs=None):
     # compute mask for padding
     mask = sequence_mask(text_lengths).to(text.device)
     embedded_inputs = self.embedding(text).transpose(1, 2)
     encoder_outputs = self.encoder(embedded_inputs, text_lengths)
     mel_outputs, stop_tokens, alignments = self.decoder(
         encoder_outputs, mel_specs, mask)
     mel_outputs_postnet = self.postnet(mel_outputs)
     mel_outputs_postnet = mel_outputs + mel_outputs_postnet
     mel_outputs, mel_outputs_postnet, alignments = self.shape_outputs(
         mel_outputs, mel_outputs_postnet, alignments)
     return mel_outputs, mel_outputs_postnet, alignments, stop_tokens
Example #8
 def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
     B = characters.size(0)
     mask = sequence_mask(text_lengths).to(characters.device)
     inputs = self.embedding(characters)
     encoder_outputs = self.encoder(inputs)
     encoder_outputs = self._add_speaker_embedding(encoder_outputs,
                                                   speaker_ids)
     gst_outputs = self.gst(mel_specs)
     gst_outputs = gst_outputs.expand(-1, encoder_outputs.size(1), -1)
     encoder_outputs = encoder_outputs + gst_outputs
     mel_outputs, alignments, stop_tokens = self.decoder(
         encoder_outputs, mel_specs, mask)
     mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
     linear_outputs = self.postnet(mel_outputs)
     linear_outputs = self.last_linear(linear_outputs)
     return mel_outputs, linear_outputs, alignments, stop_tokens
Example #9
 def forward(self, text, text_lengths, mel_specs=None, speaker_ids=None):
     self._init_states()
     # compute mask for padding
     mask = sequence_mask(text_lengths).to(text.device)
     embedded_inputs = self.embedding(text).transpose(1, 2)
     encoder_outputs = self.encoder(embedded_inputs, text_lengths)
     encoder_outputs = self._add_speaker_embedding(encoder_outputs,
                                                   speaker_ids)
     decoder_outputs, alignments, stop_tokens = self.decoder(
         encoder_outputs, mel_specs, mask)
     postnet_outputs = self.postnet(decoder_outputs)
     postnet_outputs = decoder_outputs + postnet_outputs
     decoder_outputs, postnet_outputs, alignments = self.shape_outputs(
         decoder_outputs, postnet_outputs, alignments)
     if self.bidirectional_decoder:
         decoder_outputs_backward, alignments_backward = self._backward_inference(mel_specs, encoder_outputs, mask)
         return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
     return decoder_outputs, postnet_outputs, alignments, stop_tokens
Example #10
 def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
     """
     Shapes:
         - characters: B x T_in
         - text_lengths: B
         - mel_specs: B x T_out x D
         - speaker_ids: B x 1
     """
     self._init_states()
     mask = sequence_mask(text_lengths).to(characters.device)
     # B x T_in x embed_dim
     inputs = self.embedding(characters)
     # B x speaker_embed_dim
     self.compute_speaker_embedding(speaker_ids)
     if self.num_speakers > 1:
         # B x T_in x embed_dim + speaker_embed_dim
         inputs = self._concat_speaker_embedding(inputs,
                                                 self.speaker_embeddings)
     # B x T_in x encoder_dim
     encoder_outputs = self.encoder(inputs)
     if self.gst:
         # B x gst_dim
         encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
     if self.num_speakers > 1:
         encoder_outputs = self._concat_speaker_embedding(
             encoder_outputs, self.speaker_embeddings)
     # decoder_outputs: B x decoder_dim x T_out
      # alignments: B x T_out x T_in
      # stop_tokens: B x T_out
     decoder_outputs, alignments, stop_tokens = self.decoder(
         encoder_outputs, mel_specs, mask,
         self.speaker_embeddings_projected)
     # B x T_out x decoder_dim
     postnet_outputs = self.postnet(decoder_outputs)
      # B x T_out x postnet_dim
     postnet_outputs = self.last_linear(postnet_outputs)
     # B x T_out x decoder_dim
     decoder_outputs = decoder_outputs.transpose(1, 2).contiguous()
     if self.bidirectional_decoder:
         decoder_outputs_backward, alignments_backward = self._backward_inference(
             mel_specs, encoder_outputs, mask)
         return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
     return decoder_outputs, postnet_outputs, alignments, stop_tokens
Example #11
    def test_in_out(self):
        layer = L1LossMasked()
        dummy_input = T.ones(4, 8, 128).float()
        dummy_target = T.ones(4, 8, 128).float()
        dummy_length = (T.ones(4) * 8).long()
        output = layer(dummy_input, dummy_target, dummy_length)
        assert output.item() == 0.0

        dummy_input = T.ones(4, 8, 128).float()
        dummy_target = T.zeros(4, 8, 128).float()
        dummy_length = (T.ones(4) * 8).long()
        output = layer(dummy_input, dummy_target, dummy_length)
        assert output.item() == 1.0, "1.0 vs {}".format(output.item())

        dummy_input = T.ones(4, 8, 128).float()
        dummy_target = T.zeros(4, 8, 128).float()
        dummy_length = (T.arange(5, 9)).long()
        mask = ((sequence_mask(dummy_length).float() - 1.0) *
                100.0).unsqueeze(2)
        output = layer(dummy_input + mask, dummy_target, dummy_length)
        assert output.item() == 1.0, "1.0 vs {}".format(output.item())
Example #12
 def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
     B = characters.size(0)
     mask = sequence_mask(text_lengths).to(characters.device)
     inputs = self.embedding(characters)
     self._init_states()
     self.compute_speaker_embedding(speaker_ids)
     if self.num_speakers > 1:
         inputs = self._concat_speaker_embedding(inputs,
                                                 self.speaker_embeddings)
     encoder_outputs = self.encoder(inputs)
     if self.gst:
         encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
     if self.num_speakers > 1:
         encoder_outputs = self._concat_speaker_embedding(
             encoder_outputs, self.speaker_embeddings)
     mel_outputs, alignments, stop_tokens = self.decoder(
         encoder_outputs, mel_specs, mask,
         self.speaker_embeddings_projected)
     mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
     linear_outputs = self.postnet(mel_outputs)
     linear_outputs = self.last_linear(linear_outputs)
     return mel_outputs, linear_outputs, alignments, stop_tokens
Example #13
 def forward(self, x, target, length):
     """
     Args:
          x: A FloatTensor of size (batch, max_len, dim) containing the
              predicted values.
          target: A FloatTensor of the same size containing the target
              values for each step.
          length: A LongTensor of size (batch,) containing the length of
              each sequence in the batch.
      Returns:
          loss: An average loss value masked by the length.
     """
     # mask: (batch, max_len, 1)
     target.requires_grad = False
     mask = sequence_mask(sequence_length=length,
                          max_len=target.size(1)).unsqueeze(2).float()
     mask = mask.expand_as(x)
     loss = functional.l1_loss(x * mask, target * mask, reduction="sum")
     loss = loss / mask.sum()
     return loss
Example #14
 def _make_masks(ilens, olens):
     in_masks = sequence_mask(ilens)
     out_masks = sequence_mask(olens)
     return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)
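
_make_masks combines the two per-sequence masks into a joint mask over (output step, input step) pairs, e.g. for masking attention matrices. A small demo, reusing the sequence_mask sketch from after Example #1:

    import torch

    ilens = torch.tensor([3, 5])    # text lengths
    olens = torch.tensor([4, 6])    # mel lengths
    in_masks = sequence_mask(ilens)     # (B, T_in)  -> (2, 5)
    out_masks = sequence_mask(olens)    # (B, T_out) -> (2, 6)
    joint = out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)
    print(joint.shape)  # torch.Size([2, 6, 5]); True only where both steps are valid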
Example #15
        stop_targets = (stop_targets.sum(2) > 0.0).float()

    # dispatch data to GPU
    if use_cuda:
        text_input = text_input.cuda()
        text_lengths = text_lengths.cuda()
        mel_input = mel_input.cuda()
        mel_lengths = mel_lengths.cuda()
        if linear_input is not None:
            linear_input = linear_input.cuda()
        stop_targets = stop_targets.cuda()
        if speaker_ids is not None:
            speaker_ids = speaker_ids.cuda()

    mask = sequence_mask(text_lengths)
    # print(text_input, text_lengths, mel_input, speaker_ids)
    mel_outputs, postnet_outputs, alignments, stop_tokens = model(
        text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
    # print(mel_outputs, postnet_outputs, alignments, stop_tokens)
    # compute mel specs from linear spec if model is Tacotron
    mel_specs = []
    if C.model == "Tacotron":
        postnet_outputs = postnet_outputs.data.cpu().numpy()
        for b in range(postnet_outputs.shape[0]):
            postnet_output = postnet_outputs[b]
            mel_specs.append(
                torch.FloatTensor(ap.out_linear_to_mel(
                    postnet_output.T).T).cuda())
        postnet_outputs = torch.stack(mel_specs)