def __call__(
    self,
    wave: Tensor,
    silence: Tensor,
    local: Tensor,
    source: Tensor,
    source2: Tensor,
    signal: Tensor,
    speaker_id: Optional[Tensor] = None,
):
    """Generate waveforms and report the mean mel-cepstral distortion (MCD).

    `silence`, `source2` and `signal` are accepted but unused here —
    presumably to match the batch interface of the dataset; confirm with callers.
    """
    batch_size = len(wave)

    local_padding_length = int(
        self.generator.sampling_rate * self.local_padding_time_length
    )
    output = self.generator.generate(
        local=local,
        source=source,
        speaker_id=speaker_id,
        local_padding_length=local_padding_length,
    )

    mcd_list = []
    for wi, wo in zip(wave.cpu().numpy(), output):
        wi = Wave(wave=wi, sampling_rate=wo.sampling_rate)
        # Robustness: MCD extraction can fail on degenerate generated audio;
        # record NaN instead of aborting the whole evaluation (matches the
        # sibling MCD evaluator in this file).
        try:
            mcd = calc_mcd(wave1=wi, wave2=wo)
        except Exception:
            mcd = numpy.nan
        mcd_list.append(mcd)

    scores = {
        "mcd": (numpy.mean(mcd_list), batch_size),
    }
    report(scores, self)
    return scores
def forward(
    self,
    f0: Tensor,
    phoneme: Tensor,
    spec: Tensor,
    silence: Tensor,
    padded: Tensor,
    speaker_id: Optional[Tensor] = None,
):
    """Masked L1 loss between the predicted and target spectrograms."""
    batch_size = spec.shape[0]

    predicted = self.predictor(
        f0=f0,
        phoneme=phoneme,
        spec=spec.roll(1, dims=1),  # spectrogram shifted by one frame as input
        speaker_id=speaker_id,
    )

    # Frames excluded from the loss: padding, plus silence when configured.
    exclude = padded
    if self.model_config.eliminate_silence:
        exclude = torch.logical_or(exclude, silence)

    loss = F.l1_loss(input=predicted, target=spec, reduction="none")[~exclude].mean()

    # report
    losses = dict(loss=loss)
    if not self.training:
        weight = (~exclude).to(torch.float32).mean() * batch_size
        losses = {key: (value, weight) for key, value in losses.items()}
    report(losses, self)
    return loss
def test_report(self):
    """Values passed to report() inside a Reporter context land in its observation."""
    reporter = pytorch_trainer.Reporter()
    with reporter:
        pytorch_trainer.report({'x': 1})
    recorded = reporter.observation
    self.assertIn('x', recorded)
    self.assertEqual(recorded['x'], 1)
def forward(
    self,
    wave: Tensor,
    local: Tensor,
    speaker_id: Tensor = None,
):
    """Generate waveforms and report the mean MCD against the reference waves."""
    batch_size = len(wave)

    padding = int(self.generator.sampling_rate * self.local_padding_time_second)
    generated = self.generator.generate(
        local=local,
        local_padding_length=padding,
        speaker_id=speaker_id,
    )

    mcd_list = []
    for reference, candidate in zip(wave.cpu().numpy(), generated):
        reference = Wave(wave=reference, sampling_rate=candidate.sampling_rate)
        try:
            value = calc_mcd(wave1=reference, wave2=candidate)
        except Exception:
            value = numpy.nan  # keep evaluating even if MCD extraction fails
        mcd_list.append(value)

    scores = {
        "mcd": (numpy.mean(mcd_list), batch_size),
    }
    report(scores, self)
    return scores
def __call__(
    self,
    phoneme_list: Tensor,
    phoneme_length: Tensor,
    padded: Tensor,
    speaker_id: Optional[Tensor] = None,
):
    """L1 loss between predicted and target phoneme lengths on valid positions."""
    batch_size = len(phoneme_list)

    predicted = self.predictor(
        phoneme_list=phoneme_list,
        speaker_id=speaker_id,
    )

    # Valid positions: not padding, and optionally not phoneme id 0 (pause).
    mask = ~padded
    if self.model_config.eliminate_pause:
        mask = torch.logical_and(mask, phoneme_list != 0)

    loss = F.l1_loss(predicted[mask], phoneme_length[mask], reduction="none").mean()

    # report
    values = dict(loss=loss)
    if not self.training:
        weight = mask.to(torch.float32).mean() * batch_size
        values = {key: (v, weight) for key, v in values.items()}  # add weight
    report(values, self)
    return loss
def __call__(
    self,
    f0: Tensor,
    phoneme: Tensor,
    spec: Tensor,
    silence: Tensor,
    padded: Tensor,
    speaker_id: Optional[Tensor] = None,
):
    """Generate spectrograms and report mean absolute difference and MCD."""
    batch_size = len(spec)
    keep = ~padded.cpu().numpy()

    generated = self.generator.generate(
        f0=f0,
        phoneme=phoneme,
        speaker_id=speaker_id,
    )[keep]
    reference = spec.cpu().numpy()[keep]

    diff = numpy.abs(generated - reference).mean()
    mcd = _mcd(generated, reference)

    weight = keep.mean() * batch_size
    scores = {"diff": (diff, weight), "mcd": (mcd, weight)}
    report(scores, self)
    return scores
def test_report_with_observer(self):
    """Values reported with a registered observer are namespaced by its name."""
    reporter = pytorch_trainer.Reporter()
    observer = object()
    reporter.add_observer('o', observer)
    with reporter:
        pytorch_trainer.report({'x': 1}, observer)
    recorded = reporter.observation
    self.assertIn('o/x', recorded)
    self.assertEqual(recorded['o/x'], 1)
def test_report_scope(self):
    """report_scope() redirects reported values into the given dict only."""
    reporter = pytorch_trainer.Reporter()
    scoped = {}
    with reporter:
        with pytorch_trainer.report_scope(scoped):
            pytorch_trainer.report({'x': 1})
    self.assertIn('x', scoped)
    self.assertEqual(scoped['x'], 1)
    # ...and NOT into the reporter's own observation.
    self.assertNotIn('x', reporter.observation)
def __call__(
    self,
    coarse: Tensor,
    encoded_coarse: Tensor,
    local: Tensor,
    silence: Tensor,
    randomed_encoded_coarse: Optional[Tensor] = None,
    speaker_num: Optional[Tensor] = None,
):
    """Cross-entropy of next-sample coarse prediction, with configurable
    treatment of silent frames via `loss_config.silence_weight`."""
    # Optionally feed a perturbed input sequence instead of the clean one.
    x_array = (
        encoded_coarse
        if randomed_encoded_coarse is None
        else randomed_encoded_coarse
    )
    out_c_array, _ = self.predictor(
        x_array=x_array,
        l_array=local,
        s_one=speaker_num,
        local_padding_size=self.local_padding_size,
    )

    # Lazily move the class-balancing weight onto the output device.
    if self.cbl_weight is not None:
        if self.cbl_weight.device != out_c_array.device:
            self.cbl_weight = self.cbl_weight.to(out_c_array.device)

    target_coarse = encoded_coarse[:, 1:]
    nll_coarse = F.cross_entropy(
        out_c_array, target_coarse, reduction="none", weight=self.cbl_weight
    )

    silence_weight = self.loss_config.silence_weight
    if silence_weight == 0:
        # Drop silent frames entirely.
        nll_coarse = nll_coarse[~silence]
    elif silence_weight < 0:
        # BUGFIX: the silent and non-silent selections generally have
        # different lengths, so the original element-wise addition could not
        # run; gather both groups instead, scaling the silent losses by the
        # configured weight.
        # NOTE(review): confirm the intended semantics of a negative weight.
        nll_coarse = torch.cat(
            [nll_coarse[~silence], nll_coarse[silence] * silence_weight]
        )

    nll_coarse = (
        torch.mean(nll_coarse)
        if self.loss_config.mean_silence
        # BUGFIX: `silence.size` is a bound method on a torch Tensor (the
        # original divided by it); use numel() for the element count.
        else torch.sum(nll_coarse) / silence.numel()
    )

    loss = nll_coarse
    losses = dict(loss=loss, nll_coarse=nll_coarse)
    if not self.training:
        losses = {key: (l, len(coarse)) for key, l in losses.items()}  # add weight
    report(losses, self)
    return loss
def __call__(
    self,
    f0: Tensor,
    phoneme: Tensor,
    silence: Tensor,
    start_accent: Tensor,
    end_accent: Tensor,
    padded: Tensor,
    speaker_id: Optional[Tensor] = None,
):
    """Weighted f0 (L1) and voiced/unvoiced (BCE) losses on non-padded frames."""
    batch_size = len(f0)

    d = self.predictor(
        phoneme=phoneme,
        start_accent=start_accent,
        end_accent=end_accent,
        f0=f0.roll(1, dims=1),  # f0 shifted by one frame as the model input
        speaker_id=speaker_id,
    )

    keep = ~padded
    predicted_f0 = d["f0"][keep]
    predicted_vuv = d["vuv"][keep]
    target_f0 = f0[keep]
    target_vuv = target_f0 != 0  # voiced where f0 is non-zero

    loss_f0 = F.l1_loss(predicted_f0[target_vuv], target_f0[target_vuv])
    loss_vuv = F.binary_cross_entropy_with_logits(
        predicted_vuv, target_vuv.to(torch.float32)
    )

    loss_f0 = loss_f0 * self.model_config.f0_loss_weight
    loss_vuv = loss_vuv * self.model_config.vuv_loss_weight
    loss = loss_f0 + loss_vuv

    # report
    values = dict(
        loss=loss,
        loss_f0=loss_f0,
        loss_vuv=loss_vuv,
    )
    if not self.training:
        weight = batch_size
        values = {key: (v, weight) for key, v in values.items()}  # add weight
    report(values, self)
    return loss
def forward(
    self,
    x: Tensor,
    x_ref: Optional[Tensor],
    z: Optional[Tensor],
    prefix: str,
):
    """Discriminator loss: adversarial real/fake terms plus an R1 gradient
    penalty. Exactly one of `x_ref` (style reference input) or `z` (latent)
    must be given; it selects how the style code `s` is produced.

    Reported keys are namespaced under `prefix`.
    """
    assert (x_ref is None) != (z is None)

    pad = self.model_config.padding_length

    # r1 loss — gradient penalty on real samples; enable_grad so the
    # penalty's gradients exist even when called under no_grad/eval.
    with torch.enable_grad():
        x_r1 = x[:, pad * 2 : -pad * 2]
        x_r1.requires_grad_()
        real = self.discriminator(x_r1)
        loss_r1 = calc_r1_loss(output=real, input=x_r1)

    # real loss
    loss_real = calc_adversarial_loss(x=real, is_real=True)

    # fake loss — generation of the fake sample needs no gradients here.
    with torch.no_grad():
        if z is not None:
            s = self.mapping_network(z)
        else:
            s = self.style_encoder(x_ref)
        y = self.style_transfer(x=x, s=s)
    loss_fake = calc_adversarial_loss(
        x=self.discriminator(y[:, pad:-pad]), is_real=False
    )

    loss = loss_real + loss_fake + self.model_config.r1_weight * loss_r1

    # report
    values = {
        f"{prefix}/loss": loss,
        f"{prefix}/loss_real": loss_real,
        f"{prefix}/loss_fake": loss_fake,
        f"{prefix}/loss_r1": loss_r1,
    }
    if not self.training:
        weight = x.shape[0]
        values = {key: (l, weight) for key, l in values.items()}  # add weight
    report(values, self)
    return loss
def forward(
    self,
    vowel_phoneme_list: Tensor,
    consonant_phoneme_list: Tensor,
    start_accent_list: Tensor,
    end_accent_list: Tensor,
    start_accent_phrase_list: Tensor,
    end_accent_phrase_list: Tensor,
    f0: Tensor,
    voiced: Tensor,
    padded: Tensor,
    speaker_id: Optional[Tensor] = None,
):
    """Weighted L1 f0 loss over voiced, non-padded frames."""
    batch_size = len(vowel_phoneme_list)

    output_f0 = self.predictor(
        vowel_phoneme_list=vowel_phoneme_list,
        consonant_phoneme_list=consonant_phoneme_list,
        start_accent_list=start_accent_list,
        end_accent_list=end_accent_list,
        start_accent_phrase_list=start_accent_phrase_list,
        end_accent_phrase_list=end_accent_phrase_list,
        f0=f0.roll(1, dims=1),  # f0 shifted by one frame as the model input
        speaker_id=speaker_id,
    )

    mask = torch.logical_and(voiced, ~padded)
    f0_loss = (
        F.l1_loss(output_f0[mask], f0[mask], reduction="none").mean()
        * self.model_config.f0_loss_weight
    )
    loss = f0_loss

    # report
    values = dict(loss=loss, f0_loss=f0_loss)
    if not self.training:
        weight = (~padded).to(torch.float32).mean() * batch_size
        values = {key: (v, weight) for key, v in values.items()}  # add weight
    report(values, self)
    return loss
def __call__(
    self,
    input: Tensor,
    target: Tensor,
):
    """Classification loss with accuracy reporting."""
    feature = self.predictor(input)
    logits = self.tail(feature, target)
    loss = cross_entropy(logits, target)

    # report
    values = dict(
        loss=loss,
        accuracy=accuracy(logits, target),
    )
    if not self.training:
        batch_size = input.shape[0]
        values = {key: (v, batch_size) for key, v in values.items()}  # add weight
    report(values, self)
    return loss
def __call__(
    self,
    wave: Tensor,
    local: Optional[Tensor],
    speaker_num: Optional[Tensor] = None,
):
    """Generate waveforms and report mean MCD and silence-rate accuracy."""
    batchsize = len(wave)

    wave_output = self.generator.generate(
        time_length=self.time_length + self.local_padding_time_length * 2,
        sampling_policy=self.sampling_policy,
        num_generate=batchsize,
        local_array=local,
        speaker_nums=speaker_num,
    )

    mcd_list = []
    sil_acc_list = []
    for wave_in, wave_out in zip(wave.cpu().numpy(), wave_output):
        wave_in = Wave(wave=wave_in, sampling_rate=wave_out.sampling_rate)

        # Trim the generation padding before comparing.
        if self.local_padding_time_length > 0:
            pad = int(wave_out.sampling_rate * self.local_padding_time_length)
            wave_out.wave = wave_out.wave[pad:-pad]

        mcd_list.append(calc_mcd(wave1=wave_in, wave2=wave_out))
        sil_acc_list.append(calc_silence_rate(wave1=wave_in, wave2=wave_out))

    scores = {
        "mcd": (numpy.mean(mcd_list), batchsize),
        "sil_acc": (numpy.mean(sil_acc_list), batchsize),
    }
    report(scores, self)
    return scores
def forward(
    self,
    f0: Tensor,
    phoneme: Tensor,
    silence: Tensor,
    start_accent: Tensor,
    end_accent: Tensor,
    padded: Tensor,
    speaker_id: Optional[Tensor] = None,
):
    """Generate f0 and report mean |Δf0| on commonly-voiced frames plus V/UV accuracy."""
    batch_size = len(f0)

    generated_f0 = self.generator.generate(
        phoneme=phoneme,
        start_accent=start_accent,
        end_accent=end_accent,
        speaker_id=speaker_id,
    )[~padded.cpu().numpy()]
    generated_vuv = generated_f0 != 0

    target_f0 = f0[~padded].cpu().numpy()
    target_vuv = target_f0 != 0

    # Compare f0 only where both the generated and target frames are voiced.
    both_voiced = numpy.bitwise_and(generated_vuv, target_vuv)
    f0_diff = numpy.abs(generated_f0[both_voiced] - target_f0[both_voiced]).mean()
    vuv_acc = (generated_vuv == target_vuv).mean()

    scores = {
        "f0_diff": (f0_diff, batch_size),
        "vuv_acc": (vuv_acc, batch_size),
    }
    report(scores, self)
    return scores
def __call__(
    self,
    vowel_phoneme_list: Tensor,
    consonant_phoneme_list: Tensor,
    start_accent_list: Tensor,
    end_accent_list: Tensor,
    start_accent_phrase_list: Tensor,
    end_accent_phrase_list: Tensor,
    f0: Tensor,
    voiced: Tensor,
    padded: Tensor,
    speaker_id: Optional[Tensor] = None,
):
    """Generate f0 and report mean absolute error on voiced, non-padded frames."""
    batch_size = vowel_phoneme_list.shape[0]
    keep = torch.logical_and(voiced, ~padded).cpu().numpy()

    generated = self.generator.generate(
        vowel_phoneme_list=vowel_phoneme_list,
        consonant_phoneme_list=consonant_phoneme_list,
        start_accent_list=start_accent_list,
        end_accent_list=end_accent_list,
        start_accent_phrase_list=start_accent_phrase_list,
        end_accent_phrase_list=end_accent_phrase_list,
        speaker_id=speaker_id,
    )[keep]
    reference = f0.cpu().numpy()[keep]

    diff = numpy.abs(generated - reference).mean()
    weight = keep.mean() * batch_size

    scores = {"diff": (diff, weight)}
    report(scores, self)
    return scores
def forward(
    self,
    wave: Tensor,
    local: Tensor,
    speaker_id: Optional[Tensor] = None,
):
    """Denoising training step with optional per-sample latent selection.

    When `sample_size > 1`, for each batch item `sample_size` candidate
    latents are scored without gradients, and the candidate with the LARGEST
    mean loss is kept for the final gradient-carrying forward pass.
    NOTE(review): argmax (not argmin) selection — confirm this is intended.
    """
    batch_size = wave.shape[0]
    sample_size = self.model_config.sample_size
    latent_size = self.model_config.latent_size

    noise_level = self.noise_scheduler.sample_noise_level(num=batch_size)
    noise = self.predictor.generate_noise(*wave.shape)

    latent = None
    if sample_size <= 1:
        # No latent sampling configured; the final call runs with latent=None.
        assert latent_size == 0
    else:
        assert latent_size > 0
        latent_list = []
        for i_data in range(batch_size):
            # Candidate latents for this single batch item.
            latent = self.predictor.generate_noise(sample_size, latent_size, local.shape[2])
            with torch.no_grad():
                # Score all candidates at once by expanding this item's
                # tensors along a new leading sample dimension.
                loss = self.one_forward(
                    noise_level=noise_level[i_data:i_data + 1].expand(
                        (sample_size, ) + noise_level.shape[1:]),
                    noise=noise[i_data:i_data + 1].expand((sample_size, ) + noise.shape[1:]),
                    wave=wave[i_data:i_data + 1].expand((sample_size, ) + wave.shape[1:]),
                    latent=latent,
                    local=local[i_data:i_data + 1].expand((sample_size, ) + local.shape[1:]),
                    speaker_id=(speaker_id[i_data:i_data + 1].expand((sample_size, ) + speaker_id.shape[1:])
                                if speaker_id is not None else None),
                )
            # Pick the candidate with the highest mean loss (see docstring note).
            i_sample = loss.mean(1).argmax(0)
            latent_list.append(latent[i_sample])
        latent = torch.stack(latent_list)

    loss = self.one_forward(
        noise_level=noise_level,
        noise=noise,
        wave=wave,
        latent=latent,
        local=local,
        speaker_id=speaker_id,
    ).mean()

    # report
    values = dict(loss=loss)
    if not self.training:
        values = {key: (l, batch_size) for key, l in values.items()}  # add weight
    report(values, self)
    return loss
def forward(
    self,
    x: Tensor,
    x_ref1: Optional[Tensor],
    x_ref2: Optional[Tensor],
    z1: Optional[Tensor],
    z2: Optional[Tensor],
    prefix: str,
):
    """Generator-side losses: adversarial, style reconstruction, diversity
    sensitive, cycle consistency, and identification.

    Exactly one source of style codes must be given: latents (`z1`/`z2`, via
    the mapping network) or reference inputs (`x_ref1`/`x_ref2`, via the
    style encoder). Reported keys are namespaced under `prefix`.
    """
    assert (x_ref1 is None) != (z1 is None)

    pad = self.model_config.padding_length

    if z1 is not None:
        s1 = self.mapping_network(z1)
        s2 = self.mapping_network(z2)
    else:
        s1 = self.style_encoder(x_ref1)
        s2 = self.style_encoder(x_ref2)

    # adversarial loss
    y1 = self.style_transfer(x=x, s=s1)
    loss_adv = calc_adversarial_loss(
        x=self.discriminator(y1[:, pad:-pad]), is_real=True
    )

    # style reconstruction loss — re-encoding the output should recover s1
    s1_re = self.style_encoder(y1[:, pad:-pad])
    loss_style = torch.mean(torch.abs(s1_re - s1))

    # diversity sensitive loss — negated, so minimizing it widens the gap
    # between the two stylizations
    y2 = self.style_transfer(x=x, s=s2)
    y2 = y2.detach()  # no gradient through the second stylization
    loss_diverse = -torch.mean(torch.abs(y1[:, pad:-pad] - y2[:, pad:-pad]))

    # cycle-consistency loss — transferring back with x's own style should
    # reconstruct x (central, unpadded region)
    s_x = self.style_encoder(x[:, pad * 2 : -pad * 2])
    x_re = self.style_transfer(y1, s_x)
    loss_cycle = torch.mean(torch.abs(x_re - x[:, pad * 2 : -pad * 2]))

    # identification loss — transferring x with its own style should be a no-op
    x_id = self.style_transfer(x, s_x)
    loss_identify = torch.mean(
        torch.abs(x_id[:, pad:-pad] - x[:, pad * 2 : -pad * 2])
    )

    loss = (
        loss_adv
        + self.model_config.style_reconstruction_weight * loss_style
        + self.model_config.diversity_sensitive_weight * loss_diverse
        + self.model_config.cycle_consistency_weight * loss_cycle
        + self.model_config.identification_weight * loss_identify
    )

    # report
    values = {
        f"{prefix}/loss": loss,
        f"{prefix}/loss_adv": loss_adv,
        f"{prefix}/loss_style": loss_style,
        f"{prefix}/loss_diverse": loss_diverse,
        f"{prefix}/loss_cycle": loss_cycle,
        f"{prefix}/loss_identify": loss_identify,
    }
    if not self.training:
        weight = x.shape[0]
        values = {key: (l, weight) for key, l in values.items()}  # add weight
    report(values, self)
    return loss
def test_report_without_reporter(self):
    """report() outside any Reporter context must not raise."""
    someone = object()
    pytorch_trainer.report({'x': 1}, someone)
def test_report_with_unregistered_observer(self):
    """Reporting with an observer the Reporter does not know raises KeyError."""
    reporter = pytorch_trainer.Reporter()
    stranger = object()
    with reporter:
        with self.assertRaises(KeyError):
            pytorch_trainer.report({'x': 1}, stranger)
def forward(self, x, y):
    """Record the call arguments and report the summed inputs as 'loss'."""
    self.args.append((x, y))
    total = x.sum() + y.sum()
    pytorch_trainer.report({'loss': total}, self)
def __call__(
    self,
    f0: Sequence[Tensor],
    phoneme: Sequence[Tensor],
    phoneme_list: Sequence[Tensor],
    speaker_id: Optional[Sequence[Tensor]] = None,
):
    """Weighted f0/phoneme/vuv/stop losses over variable-length sequences,
    plus phoneme/vuv/stop accuracies."""
    batch_size = len(f0)

    # BUGFIX: `speaker_id` is Optional, but torch.stack(None) raises a
    # TypeError; only stack when speaker ids are actually provided.
    if speaker_id is not None:
        speaker_id = torch.stack(speaker_id)

    d = self.predictor(
        f0=[h[:-1] for h in f0],  # feed all but the last frame as input
        phoneme=[h[:-1] for h in phoneme],
        phoneme_list=phoneme_list,
        speaker_id=speaker_id,
    )
    output_f0 = d["f0"]
    output_phoneme = d["phoneme"]
    output_vuv = d["vuv"]
    output_stop = d["stop"]

    stacked_f0 = torch.cat(f0)
    stacked_phoneme = torch.cat(phoneme)
    stacked_vuv = stacked_f0 != 0  # voiced where f0 is non-zero
    # Mark the final frame of every sequence as the stop target.
    stacked_stop = torch.zeros_like(stacked_f0, dtype=torch.bool)
    stacked_stop[numpy.cumsum([h.shape[0] for h in f0]) - 1] = True

    # loss
    loss_f0 = F.l1_loss(output_f0[stacked_vuv], stacked_f0[stacked_vuv])
    loss_phoneme = F.cross_entropy(output_phoneme, stacked_phoneme)
    loss_vuv = F.binary_cross_entropy_with_logits(
        output_vuv, stacked_vuv.to(torch.float32)
    )
    loss_stop = F.binary_cross_entropy_with_logits(
        output_stop,
        stacked_stop.to(torch.float32),
        pos_weight=torch.ones_like(output_stop) * 10,  # stop frames are rare
    )

    loss_f0 = loss_f0 * self.model_config.f0_loss_weight
    loss_phoneme = loss_phoneme * self.model_config.phoneme_loss_weight
    loss_vuv = loss_vuv * self.model_config.vuv_loss_weight
    loss_stop = loss_stop * self.model_config.stop_loss_weight
    loss = loss_f0 + loss_phoneme + loss_vuv + loss_stop

    # metric
    accuracy_phoneme = accuracy(output_phoneme, stacked_phoneme)
    accuracy_vuv = accuracy(output_vuv, stacked_vuv)
    accuracy_stop = accuracy(output_stop, stacked_stop)

    # report
    values = dict(
        loss=loss,
        loss_f0=loss_f0,
        loss_phoneme=loss_phoneme,
        loss_vuv=loss_vuv,
        loss_stop=loss_stop,
        accuracy_phoneme=accuracy_phoneme,
        accuracy_vuv=accuracy_vuv,
        accuracy_stop=accuracy_stop,
    )
    if not self.training:
        weight = batch_size
        values = {key: (l, weight) for key, l in values.items()}  # add weight
    report(values, self)
    return loss
def forward(
    self,
    wave: Tensor,
    f0: Tensor,
    phoneme: Tensor,
    padded: Tensor,
    speaker: Optional[Tensor] = None,
):
    """Multi-task loss over wave-derived features: voiced/unvoiced, f0,
    phoneme classification, and (optionally) speaker classification.

    Assumes the predictor's split feature tensors are laid out as
    (batch, channels, length) before the transpose — TODO confirm.
    """
    batch_size = wave.shape[0]
    length = f0.shape[1]
    mask = ~padded              # valid (non-padded) frames
    voiced = f0 != 0            # voiced where f0 is non-zero
    long_voiced = voiced.long()

    features = self.predictor(wave, return_with_splited=True)
    # Flatten (batch, length) into one axis so frame-wise heads see 2-D input.
    feature = features["feature"].transpose(1, 2).reshape(
        batch_size * length, -1)
    voiced_feature = (features["voiced"].transpose(1, 2).reshape(
        batch_size * length, -1))
    f0_feature = features["f0"].transpose(1, 2)
    phoneme_feature = (features["phoneme"].transpose(1, 2).reshape(
        batch_size * length, -1))

    voiced_output = self.voiced_network(voiced_feature)
    # The phoneme head takes the target as an extra argument in training mode
    # only — presumably teacher forcing; verify against PhonemeNetwork.
    if self.training:
        phoneme_output = self.phoneme_network(phoneme_feature, phoneme)
    else:
        phoneme_output = self.phoneme_network(phoneme_feature)

    # Per-frame losses, averaged over valid frames only.
    voiced_loss = F.cross_entropy(
        voiced_output, long_voiced.reshape(-1),
        reduction="none")[mask.reshape(-1)].mean()
    phoneme_loss = F.cross_entropy(
        phoneme_output, phoneme.reshape(-1),
        reduction="none")[mask.reshape(-1)].mean()

    f0_output = self.f0_network(x=f0_feature)
    # f0 loss only on voiced frames, further restricted to valid frames.
    f0_loss = F.l1_loss(f0_output[voiced], f0[voiced],
                        reduction="none")[mask[voiced]].mean()

    if speaker is not None:
        # Broadcast the per-utterance speaker id over every frame; detach the
        # shared feature so the speaker head does not backprop into it.
        expanded_speaker = speaker.unsqueeze(1).expand(batch_size, length)
        speaker_output = self.speaker_network(feature.detach())
        speaker_loss = F.cross_entropy(speaker_output,
                                       expanded_speaker.reshape(-1))
        speaker_accuracy = accuracy(speaker_output,
                                    expanded_speaker.reshape(-1))
    else:
        speaker_loss = 0
        speaker_accuracy = 0

    predictor_loss = (self.config.voiced_loss_weight * voiced_loss +
                      self.config.f0_loss_weight * f0_loss +
                      self.config.phoneme_loss_weight * phoneme_loss)
    loss = predictor_loss + speaker_loss

    # report
    values = dict(
        loss=loss,
        predictor_loss=predictor_loss,
        voiced_loss=voiced_loss,
        f0_loss=f0_loss,
        phoneme_loss=phoneme_loss,
        speaker_loss=speaker_loss,
        voiced_accuracy=accuracy(voiced_output,
                                 long_voiced.reshape(-1),
                                 mask=mask.reshape(-1)),
        phoneme_accuracy=accuracy(phoneme_output,
                                  phoneme.reshape(-1),
                                  mask=mask.reshape(-1)),
        speaker_accuracy=speaker_accuracy,
    )
    if not self.training:
        values = {key: (l, batch_size) for key, l in values.items()}  # add weight
    report(values, self)
    return loss