Example #1
    def __call__(
        self,
        wave: Tensor,
        silence: Tensor,
        local: Tensor,
        source: Tensor,
        source2: Tensor,
        signal: Tensor,
        speaker_id: Optional[Tensor] = None,
    ):
        batch_size = len(wave)

        local_padding_length = int(self.generator.sampling_rate *
                                   self.local_padding_time_length)

        output = self.generator.generate(
            local=local,
            source=source,
            speaker_id=speaker_id,
            local_padding_length=local_padding_length,
        )

        mcd_list = []
        for wi, wo in zip(wave.cpu().numpy(), output):
            wi = Wave(wave=wi, sampling_rate=wo.sampling_rate)
            mcd = calc_mcd(wave1=wi, wave2=wo)
            mcd_list.append(mcd)

        scores = {
            "mcd": (numpy.mean(mcd_list), batch_size),
        }

        report(scores, self)
        return scores
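
The (value, weight) pairs reported above are presumably combined into a weighted mean by the trainer's summary, as in Chainer's DictSummary; a minimal sketch of that convention (weighted_mean is a hypothetical helper, not part of pytorch_trainer):

    import numpy

    def weighted_mean(pairs):
        # pairs: one (value, weight) tuple per reported batch
        values = numpy.array([v for v, _ in pairs], dtype=numpy.float64)
        weights = numpy.array([w for _, w in pairs], dtype=numpy.float64)
        return (values * weights).sum() / weights.sum()

    # two evaluation batches of different effective size:
    print(weighted_mean([(0.5, 4), (0.9, 1)]))  # -> 0.58
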
Example #2
    def forward(
        self,
        f0: Tensor,
        phoneme: Tensor,
        spec: Tensor,
        silence: Tensor,
        padded: Tensor,
        speaker_id: Optional[Tensor] = None,
    ):
        batch_size = spec.shape[0]

        output = self.predictor(
            f0=f0,
            phoneme=phoneme,
            spec=spec.roll(1, dims=1),
            speaker_id=speaker_id,
        )

        loss = F.l1_loss(input=output, target=spec, reduction="none")

        mask = padded
        if self.model_config.eliminate_silence:
            mask = torch.logical_or(mask, silence)
        loss = loss[~mask]

        loss = loss.mean()

        # report
        losses = dict(loss=loss)
        if not self.training:
            weight = (~mask).to(torch.float32).mean() * batch_size
            losses = {key: (l, weight) for key, l in losses.items()}
        report(losses, self)

        return loss
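
A note on the masking above: indexing with a boolean mask flattens the selected elements, so the final .mean() averages only over the kept (non-padded, optionally non-silent) positions. A minimal standalone sketch:

    import torch

    loss = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
    mask = torch.tensor([[False, True], [False, False]])  # True = drop
    print(loss[~mask])         # tensor([1., 3., 4.])
    print(loss[~mask].mean())  # tensor(2.6667)
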
Example #3
    def test_report(self):
        reporter = pytorch_trainer.Reporter()
        with reporter:
            pytorch_trainer.report({'x': 1})
        observation = reporter.observation
        self.assertIn('x', observation)
        self.assertEqual(observation['x'], 1)
Example #4
    def forward(
        self,
        wave: Tensor,
        local: Tensor,
        speaker_id: Optional[Tensor] = None,
    ):
        batch_size = len(wave)

        local_padding_length = int(self.generator.sampling_rate *
                                   self.local_padding_time_second)

        output = self.generator.generate(
            local=local,
            local_padding_length=local_padding_length,
            speaker_id=speaker_id,
        )

        mcd_list = []
        for wi, wo in zip(wave.cpu().numpy(), output):
            wi = Wave(wave=wi, sampling_rate=wo.sampling_rate)
            try:
                mcd = calc_mcd(wave1=wi, wave2=wo)
            except Exception:
                mcd = numpy.nan
            mcd_list.append(mcd)

        scores = {
            "mcd": (numpy.mean(mcd_list), batch_size),
        }

        report(scores, self)
        return scores
Example #5
    def __call__(
        self,
        phoneme_list: Tensor,
        phoneme_length: Tensor,
        padded: Tensor,
        speaker_id: Optional[Tensor] = None,
    ):
        batch_size = len(phoneme_list)

        output = self.predictor(
            phoneme_list=phoneme_list,
            speaker_id=speaker_id,
        )

        mask = ~padded
        if self.model_config.eliminate_pause:
            mask = torch.logical_and(mask, phoneme_list != 0)

        loss = F.l1_loss(output[mask], phoneme_length[mask], reduction="none")
        loss = loss.mean()

        # report
        values = dict(loss=loss)
        if not self.training:
            weight = mask.to(torch.float32).mean() * batch_size
            values = {key: (l, weight)
                      for key, l in values.items()}  # add weight
        report(values, self)

        return loss
Example #6
    def __call__(
        self,
        f0: Tensor,
        phoneme: Tensor,
        spec: Tensor,
        silence: Tensor,
        padded: Tensor,
        speaker_id: Optional[Tensor] = None,
    ):
        batch_size = len(spec)
        numpy_padded = padded.cpu().numpy()

        out_spec = self.generator.generate(
            f0=f0,
            phoneme=phoneme,
            speaker_id=speaker_id,
        )
        out_spec = out_spec[~numpy_padded]

        in_spec = spec.cpu().numpy()[~numpy_padded]

        diff = numpy.abs(out_spec - in_spec).mean()
        mcd = _mcd(out_spec, in_spec)

        weight = (~numpy_padded).mean() * batch_size
        scores = {"diff": (diff, weight), "mcd": (mcd, weight)}

        report(scores, self)
        return scores
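
_mcd is not shown in this listing; a sketch assuming the usual mel-cepstral-distortion definition, (10 / ln 10) * sqrt(2 * ||c - c'||^2) averaged over frames (implementations often also exclude the 0th coefficient):

    import numpy

    def _mcd_sketch(spec1: numpy.ndarray, spec2: numpy.ndarray) -> float:
        # spec1, spec2: (frames, coefficients) mel-cepstral arrays
        diff2 = ((spec1 - spec2) ** 2).sum(axis=1)  # squared distance per frame
        return float(((10.0 / numpy.log(10)) * numpy.sqrt(2.0 * diff2)).mean())
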
Example #7
    def test_report_with_observer(self):
        reporter = pytorch_trainer.Reporter()
        observer = object()
        reporter.add_observer('o', observer)
        with reporter:
            pytorch_trainer.report({'x': 1}, observer)
        observation = reporter.observation
        self.assertIn('o/x', observation)
        self.assertEqual(observation['o/x'], 1)
Example #8
    def test_report_scope(self):
        reporter = pytorch_trainer.Reporter()
        observation = {}

        with reporter:
            with pytorch_trainer.report_scope(observation):
                pytorch_trainer.report({'x': 1})

        self.assertIn('x', observation)
        self.assertEqual(observation['x'], 1)
        self.assertNotIn('x', reporter.observation)
Example #9
    def __call__(
        self,
        coarse: Tensor,
        encoded_coarse: Tensor,
        local: Tensor,
        silence: Tensor,
        randomed_encoded_coarse: Optional[Tensor] = None,
        speaker_num: Optional[Tensor] = None,
    ):
        x_array = (
            encoded_coarse
            if randomed_encoded_coarse is None
            else randomed_encoded_coarse
        )

        out_c_array, _ = self.predictor(
            x_array=x_array,
            l_array=local,
            s_one=speaker_num,
            local_padding_size=self.local_padding_size,
        )

        if self.cbl_weight is not None:
            if self.cbl_weight.device != out_c_array.device:
                self.cbl_weight = self.cbl_weight.to(out_c_array.device)

        target_coarse = encoded_coarse[:, 1:]
        nll_coarse = F.cross_entropy(
            out_c_array, target_coarse, reduction="none", weight=self.cbl_weight
        )

        silence_weight = self.loss_config.silence_weight
        if silence_weight == 0:
            nll_coarse = nll_coarse[~silence]
        elif silence_weight < 0:
            # keep all frames, scaling the silent-frame losses by silence_weight
            nll_coarse = torch.cat(
                [nll_coarse[~silence], nll_coarse[silence] * silence_weight]
            )

        nll_coarse = (
            torch.mean(nll_coarse)
            if self.loss_config.mean_silence
            else torch.sum(nll_coarse) / silence.numel()
        )

        loss = nll_coarse
        losses = dict(loss=loss, nll_coarse=nll_coarse)

        if not self.training:
            losses = {key: (l, len(coarse)) for key, l in losses.items()}  # add weight
        report(losses, self)
        return loss
Example #10
    def __call__(
        self,
        f0: Tensor,
        phoneme: Tensor,
        silence: Tensor,
        start_accent: Tensor,
        end_accent: Tensor,
        padded: Tensor,
        speaker_id: Optional[Tensor] = None,
    ):
        batch_size = len(f0)

        d = self.predictor(
            phoneme=phoneme,
            start_accent=start_accent,
            end_accent=end_accent,
            f0=f0.roll(1, dims=1),
            speaker_id=speaker_id,
        )
        output_f0 = d["f0"][~padded]
        output_vuv = d["vuv"][~padded]

        f0 = f0[~padded]
        vuv = f0 != 0

        loss_f0 = F.l1_loss(output_f0[vuv], f0[vuv])
        loss_vuv = F.binary_cross_entropy_with_logits(output_vuv,
                                                      vuv.to(torch.float32))

        loss_f0 = loss_f0 * self.model_config.f0_loss_weight
        loss_vuv = loss_vuv * self.model_config.vuv_loss_weight
        loss = loss_f0 + loss_vuv

        # report
        values = dict(
            loss=loss,
            loss_f0=loss_f0,
            loss_vuv=loss_vuv,
        )
        if not self.training:
            weight = batch_size
            values = {key: (l, weight)
                      for key, l in values.items()}  # add weight
        report(values, self)

        return loss
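
The f0.roll(1, dims=1) above conditions each step on the previous frame's f0 (teacher forcing). torch.roll shifts circularly, so the first frame receives the wrapped-around last value, which is presumably ignored or masked downstream:

    import torch

    f0 = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
    # position t now holds the value from t-1 (position 0 wraps around)
    print(f0.roll(1, dims=1))  # tensor([[4., 1., 2., 3.]])
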
Example #11
    def forward(
        self, x: Tensor, x_ref: Optional[Tensor], z: Optional[Tensor], prefix: str,
    ):
        assert (x_ref is None) != (z is None)

        pad = self.model_config.padding_length

        # r1 loss
        with torch.enable_grad():
            x_r1 = x[:, pad * 2 : -pad * 2]
            x_r1.requires_grad_()
            real = self.discriminator(x_r1)
            loss_r1 = calc_r1_loss(output=real, input=x_r1)

        # real loss
        loss_real = calc_adversarial_loss(x=real, is_real=True)

        # fake loss
        with torch.no_grad():
            if z is not None:
                s = self.mapping_network(z)
            else:
                s = self.style_encoder(x_ref)

            y = self.style_transfer(x=x, s=s)
        loss_fake = calc_adversarial_loss(
            x=self.discriminator(y[:, pad:-pad]), is_real=False
        )

        loss = loss_real + loss_fake + self.model_config.r1_weight * loss_r1

        # report
        values = {
            f"{prefix}/loss": loss,
            f"{prefix}/loss_real": loss_real,
            f"{prefix}/loss_fake": loss_fake,
            f"{prefix}/loss_r1": loss_r1,
        }
        if not self.training:
            weight = x.shape[0]
            values = {key: (l, weight) for key, l in values.items()}  # add weight
        report(values, self)

        return loss
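
calc_r1_loss is not shown here; assuming it follows the standard R1 penalty (squared gradient norm of the real logits with respect to the real input), a sketch:

    import torch

    def r1_penalty(output: torch.Tensor, input: torch.Tensor) -> torch.Tensor:
        # gradient of the summed real logits w.r.t. the real input,
        # which must have requires_grad set (as x_r1 does above)
        (grad,) = torch.autograd.grad(
            outputs=output.sum(), inputs=input, create_graph=True
        )
        # mean over the batch of the squared gradient norm
        return grad.pow(2).reshape(grad.shape[0], -1).sum(dim=1).mean()
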
Example #12
    def forward(
        self,
        vowel_phoneme_list: Tensor,
        consonant_phoneme_list: Tensor,
        start_accent_list: Tensor,
        end_accent_list: Tensor,
        start_accent_phrase_list: Tensor,
        end_accent_phrase_list: Tensor,
        f0: Tensor,
        voiced: Tensor,
        padded: Tensor,
        speaker_id: Optional[Tensor] = None,
    ):
        batch_size = len(vowel_phoneme_list)

        output_f0 = self.predictor(
            vowel_phoneme_list=vowel_phoneme_list,
            consonant_phoneme_list=consonant_phoneme_list,
            start_accent_list=start_accent_list,
            end_accent_list=end_accent_list,
            start_accent_phrase_list=start_accent_phrase_list,
            end_accent_phrase_list=end_accent_phrase_list,
            f0=f0.roll(1, dims=1),
            speaker_id=speaker_id,
        )

        mask = torch.logical_and(voiced, ~padded)
        f0_loss = F.l1_loss(output_f0[mask], f0[mask], reduction="none")
        f0_loss = f0_loss.mean() * self.model_config.f0_loss_weight
        loss = f0_loss

        values = dict(loss=loss, f0_loss=f0_loss)

        # report
        if not self.training:
            weight = (~padded).to(torch.float32).mean() * batch_size
            values = {key: (l, weight)
                      for key, l in values.items()}  # add weight
        report(values, self)

        return loss
Example #13
    def __call__(
        self,
        input: Tensor,
        target: Tensor,
    ):
        feature = self.predictor(input)
        output = self.tail(feature, target)

        loss = cross_entropy(output, target)

        # report
        values = dict(
            loss=loss,
            accuracy=accuracy(output, target),
        )
        if not self.training:
            weight = input.shape[0]
            values = {key: (l, weight) for key, l in values.items()}  # add weight
        report(values, self)

        return loss
Example #14
    def __call__(
        self,
        wave: Tensor,
        local: Optional[Tensor],
        speaker_num: Optional[Tensor] = None,
    ):
        batchsize = len(wave)

        wave_output = self.generator.generate(
            time_length=self.time_length + self.local_padding_time_length * 2,
            sampling_policy=self.sampling_policy,
            num_generate=batchsize,
            local_array=local,
            speaker_nums=speaker_num,
        )

        mcd_list = []
        sil_acc_list = []
        for wi, wo in zip(wave.cpu().numpy(), wave_output):
            wi = Wave(wave=wi, sampling_rate=wo.sampling_rate)

            if self.local_padding_time_length > 0:
                pad = int(wo.sampling_rate * self.local_padding_time_length)
                wo.wave = wo.wave[pad:-pad]

            mcd = calc_mcd(wave1=wi, wave2=wo)
            mcd_list.append(mcd)

            accuracy = calc_silence_rate(wave1=wi, wave2=wo)
            sil_acc_list.append(accuracy)

        scores = {
            "mcd": (numpy.mean(mcd_list), batchsize),
            "sil_acc": (numpy.mean(sil_acc_list), batchsize),
        }

        report(scores, self)
        return scores
Example #15
    def forward(
        self,
        f0: Tensor,
        phoneme: Tensor,
        silence: Tensor,
        start_accent: Tensor,
        end_accent: Tensor,
        padded: Tensor,
        speaker_id: Optional[Tensor] = None,
    ):
        batch_size = len(f0)

        out_f0 = self.generator.generate(
            phoneme=phoneme,
            start_accent=start_accent,
            end_accent=end_accent,
            speaker_id=speaker_id,
        )
        out_f0 = out_f0[~padded.cpu().numpy()]
        out_vuv = out_f0 != 0

        in_f0 = f0[~padded].cpu().numpy()
        in_vuv = in_f0 != 0

        vuv = numpy.bitwise_and(out_vuv, in_vuv)

        f0_diff = numpy.abs(out_f0[vuv] - in_f0[vuv]).mean()
        vuv_acc = (out_vuv == in_vuv).mean()

        scores = {
            "f0_diff": (f0_diff, batch_size),
            "vuv_acc": (vuv_acc, batch_size)
        }

        report(scores, self)
        return scores
Example #16
    def __call__(
        self,
        vowel_phoneme_list: Tensor,
        consonant_phoneme_list: Tensor,
        start_accent_list: Tensor,
        end_accent_list: Tensor,
        start_accent_phrase_list: Tensor,
        end_accent_phrase_list: Tensor,
        f0: Tensor,
        voiced: Tensor,
        padded: Tensor,
        speaker_id: Optional[Tensor] = None,
    ):
        batch_size = vowel_phoneme_list.shape[0]
        numpy_mask = torch.logical_and(voiced, ~padded).cpu().numpy()

        out_f0 = self.generator.generate(
            vowel_phoneme_list=vowel_phoneme_list,
            consonant_phoneme_list=consonant_phoneme_list,
            start_accent_list=start_accent_list,
            end_accent_list=end_accent_list,
            start_accent_phrase_list=start_accent_phrase_list,
            end_accent_phrase_list=end_accent_phrase_list,
            speaker_id=speaker_id,
        )
        out_f0 = out_f0[numpy_mask]

        in_f0 = f0.cpu().numpy()[numpy_mask]

        diff = numpy.abs(out_f0 - in_f0).mean()

        weight = numpy_mask.mean() * batch_size
        scores = {"diff": (diff, weight)}

        report(scores, self)
        return scores
Example #17
    def forward(
        self,
        wave: Tensor,
        local: Tensor,
        speaker_id: Optional[Tensor] = None,
    ):
        batch_size = wave.shape[0]
        sample_size = self.model_config.sample_size
        latent_size = self.model_config.latent_size

        noise_level = self.noise_scheduler.sample_noise_level(num=batch_size)
        noise = self.predictor.generate_noise(*wave.shape)

        latent = None
        if sample_size <= 1:
            assert latent_size == 0
        else:
            assert latent_size > 0

            latent_list = []
            for i_data in range(batch_size):
                latent = self.predictor.generate_noise(sample_size,
                                                       latent_size,
                                                       local.shape[2])

                with torch.no_grad():
                    loss = self.one_forward(
                        noise_level=noise_level[i_data:i_data + 1].expand(
                            (sample_size, ) + noise_level.shape[1:]),
                        noise=noise[i_data:i_data + 1].expand((sample_size, ) +
                                                              noise.shape[1:]),
                        wave=wave[i_data:i_data + 1].expand((sample_size, ) +
                                                            wave.shape[1:]),
                        latent=latent,
                        local=local[i_data:i_data + 1].expand((sample_size, ) +
                                                              local.shape[1:]),
                        speaker_id=(speaker_id[i_data:i_data +
                                               1].expand((sample_size, ) +
                                                         speaker_id.shape[1:])
                                    if speaker_id is not None else None),
                    )

                i_sample = loss.mean(1).argmax(0)
                latent_list.append(latent[i_sample])

            latent = torch.stack(latent_list)

        loss = self.one_forward(
            noise_level=noise_level,
            noise=noise,
            wave=wave,
            latent=latent,
            local=local,
            speaker_id=speaker_id,
        ).mean()

        # report
        values = dict(loss=loss)
        if not self.training:
            values = {key: (l, batch_size)
                      for key, l in values.items()}  # add weight
        report(values, self)

        return loss
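
The expand calls above broadcast a single batch item across sample_size candidate latents without copying memory; the loop then keeps, per item, the latent whose loss is largest. A minimal sketch of the expand pattern:

    import torch

    x = torch.arange(3.0).reshape(1, 3)
    y = x.expand((4,) + x.shape[1:])  # broadcast view of shape (4, 3), no copy
    print(y.shape)  # torch.Size([4, 3])
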
Example #18
    def forward(
        self,
        x: Tensor,
        x_ref1: Optional[Tensor],
        x_ref2: Optional[Tensor],
        z1: Optional[Tensor],
        z2: Optional[Tensor],
        prefix: str,
    ):
        assert (x_ref1 is None) != (z1 is None)

        pad = self.model_config.padding_length

        if z1 is not None:
            s1 = self.mapping_network(z1)
            s2 = self.mapping_network(z2)
        else:
            s1 = self.style_encoder(x_ref1)
            s2 = self.style_encoder(x_ref2)

        # adversarial loss
        y1 = self.style_transfer(x=x, s=s1)
        loss_adv = calc_adversarial_loss(
            x=self.discriminator(y1[:, pad:-pad]), is_real=True
        )

        # style reconstruction loss
        s1_re = self.style_encoder(y1[:, pad:-pad])
        loss_style = torch.mean(torch.abs(s1_re - s1))

        # diversity sensitive loss
        y2 = self.style_transfer(x=x, s=s2)
        y2 = y2.detach()
        loss_diverse = -torch.mean(torch.abs(y1[:, pad:-pad] - y2[:, pad:-pad]))

        # cycle-consistency loss
        s_x = self.style_encoder(x[:, pad * 2 : -pad * 2])
        x_re = self.style_transfer(y1, s_x)
        loss_cycle = torch.mean(torch.abs(x_re - x[:, pad * 2 : -pad * 2]))

        # identification loss
        x_id = self.style_transfer(x, s_x)
        loss_identify = torch.mean(
            torch.abs(x_id[:, pad:-pad] - x[:, pad * 2 : -pad * 2])
        )

        loss = (
            loss_adv
            + self.model_config.style_reconstruction_weight * loss_style
            + self.model_config.diversity_sensitive_weight * loss_diverse
            + self.model_config.cycle_consistency_weight * loss_cycle
            + self.model_config.identification_weight * loss_identify
        )

        # report
        values = {
            f"{prefix}/loss": loss,
            f"{prefix}/loss_adv": loss_adv,
            f"{prefix}/loss_style": loss_style,
            f"{prefix}/loss_diverse": loss_diverse,
            f"{prefix}/loss_cycle": loss_cycle,
            f"{prefix}/loss_identify": loss_identify,
        }
        if not self.training:
            weight = x.shape[0]
            values = {key: (l, weight) for key, l in values.items()}  # add weight
        report(values, self)

        return loss
Example #19
    def test_report_without_reporter(self):
        observer = object()
        pytorch_trainer.report({'x': 1}, observer)
Example #20
    def test_report_with_unregistered_observer(self):
        reporter = pytorch_trainer.Reporter()
        observer = object()
        with reporter:
            with self.assertRaises(KeyError):
                pytorch_trainer.report({'x': 1}, observer)
Example #21
    def forward(self, x, y):
        self.args.append((x, y))
        pytorch_trainer.report({'loss': x.sum() + y.sum()}, self)
Example #22
    def __call__(
        self,
        f0: Sequence[Tensor],
        phoneme: Sequence[Tensor],
        phoneme_list: Sequence[Tensor],
        speaker_id: Optional[Sequence[Tensor]] = None,
    ):
        batch_size = len(f0)

        if speaker_id is not None:
            speaker_id = torch.stack(speaker_id)

        d = self.predictor(
            f0=[h[:-1] for h in f0],
            phoneme=[h[:-1] for h in phoneme],
            phoneme_list=phoneme_list,
            speaker_id=speaker_id,
        )
        output_f0 = d["f0"]
        output_phoneme = d["phoneme"]
        output_vuv = d["vuv"]
        output_stop = d["stop"]

        stacked_f0 = torch.cat(f0)
        stacked_phoneme = torch.cat(phoneme)
        stacked_vuv = stacked_f0 != 0

        stacked_stop = torch.zeros_like(stacked_f0, dtype=torch.bool)
        stacked_stop[numpy.cumsum([h.shape[0] for h in f0]) - 1] = True

        # loss
        loss_f0 = F.l1_loss(output_f0[stacked_vuv], stacked_f0[stacked_vuv])
        loss_phoneme = F.cross_entropy(output_phoneme, stacked_phoneme)
        loss_vuv = F.binary_cross_entropy_with_logits(
            output_vuv, stacked_vuv.to(torch.float32)
        )
        loss_stop = F.binary_cross_entropy_with_logits(
            output_stop,
            stacked_stop.to(torch.float32),
            pos_weight=torch.ones_like(output_stop) * 10,
        )

        loss_f0 = loss_f0 * self.model_config.f0_loss_weight
        loss_phoneme = loss_phoneme * self.model_config.phoneme_loss_weight
        loss_vuv = loss_vuv * self.model_config.vuv_loss_weight
        loss_stop = loss_stop * self.model_config.stop_loss_weight
        loss = loss_f0 + loss_phoneme + loss_vuv + loss_stop

        # metric
        accuracy_phoneme = accuracy(output_phoneme, stacked_phoneme)
        accuracy_vuv = accuracy(output_vuv, stacked_vuv)
        accuracy_stop = accuracy(output_stop, stacked_stop)

        # report
        values = dict(
            loss=loss,
            loss_f0=loss_f0,
            loss_phoneme=loss_phoneme,
            loss_vuv=loss_vuv,
            loss_stop=loss_stop,
            accuracy_phoneme=accuracy_phoneme,
            accuracy_vuv=accuracy_vuv,
            accuracy_stop=accuracy_stop,
        )
        if not self.training:
            weight = batch_size
            values = {key: (l, weight) for key, l in values.items()}  # add weight
        report(values, self)

        return loss
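
The pos_weight argument above scales the loss on positive targets, up-weighting the rare stop frames by a factor of 10. A standalone illustration:

    import torch
    import torch.nn.functional as F

    logits = torch.tensor([0.0])
    target = torch.tensor([1.0])
    print(F.binary_cross_entropy_with_logits(logits, target))  # ~0.6931
    print(F.binary_cross_entropy_with_logits(
        logits, target, pos_weight=torch.tensor([10.0])))      # ~6.9315
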
Example #23
    def forward(
        self,
        wave: Tensor,
        f0: Tensor,
        phoneme: Tensor,
        padded: Tensor,
        speaker: Optional[Tensor] = None,
    ):
        batch_size = wave.shape[0]
        length = f0.shape[1]
        mask = ~padded

        voiced = f0 != 0
        long_voiced = voiced.long()

        features = self.predictor(wave, return_with_splited=True)
        feature = features["feature"].transpose(1, 2).reshape(
            batch_size * length, -1)
        voiced_feature = (features["voiced"].transpose(1, 2).reshape(
            batch_size * length, -1))
        f0_feature = features["f0"].transpose(1, 2)
        phoneme_feature = (features["phoneme"].transpose(1, 2).reshape(
            batch_size * length, -1))

        voiced_output = self.voiced_network(voiced_feature)
        if self.training:
            phoneme_output = self.phoneme_network(phoneme_feature, phoneme)
        else:
            phoneme_output = self.phoneme_network(phoneme_feature)

        voiced_loss = F.cross_entropy(
            voiced_output, long_voiced.reshape(-1),
            reduction="none")[mask.reshape(-1)].mean()
        phoneme_loss = F.cross_entropy(
            phoneme_output, phoneme.reshape(-1),
            reduction="none")[mask.reshape(-1)].mean()

        f0_output = self.f0_network(x=f0_feature)
        f0_loss = F.l1_loss(f0_output[voiced], f0[voiced],
                            reduction="none")[mask[voiced]].mean()

        if speaker is not None:
            expanded_speaker = speaker.unsqueeze(1).expand(batch_size, length)

            speaker_output = self.speaker_network(feature.detach())

            speaker_loss = F.cross_entropy(speaker_output,
                                           expanded_speaker.reshape(-1))
            speaker_accuracy = accuracy(speaker_output,
                                        expanded_speaker.reshape(-1))
        else:
            speaker_loss = 0
            speaker_accuracy = 0

        predictor_loss = (self.config.voiced_loss_weight * voiced_loss +
                          self.config.f0_loss_weight * f0_loss +
                          self.config.phoneme_loss_weight * phoneme_loss)
        loss = predictor_loss + speaker_loss

        # report
        values = dict(
            loss=loss,
            predictor_loss=predictor_loss,
            voiced_loss=voiced_loss,
            f0_loss=f0_loss,
            phoneme_loss=phoneme_loss,
            speaker_loss=speaker_loss,
            voiced_accuracy=accuracy(voiced_output,
                                     long_voiced.reshape(-1),
                                     mask=mask.reshape(-1)),
            phoneme_accuracy=accuracy(phoneme_output,
                                      phoneme.reshape(-1),
                                      mask=mask.reshape(-1)),
            speaker_accuracy=speaker_accuracy,
        )
        if not self.training:
            values = {key: (l, batch_size)
                      for key, l in values.items()}  # add weight
        report(values, self)

        return loss