class WaveGlowModel(GlowVocoder, Exportable):
    """WaveGlow model used to convert between spectrograms and audio."""

    def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
        if isinstance(cfg, dict):
            cfg = OmegaConf.create(cfg)
        super().__init__(cfg=cfg, trainer=trainer)

        schema = OmegaConf.structured(WaveglowConfig)
        # ModelPT ensures that cfg is a DictConfig, but do this second check in case ModelPT changes
        if isinstance(cfg, dict):
            cfg = OmegaConf.create(cfg)
        elif not isinstance(cfg, DictConfig):
            raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
        # Ensure passed cfg is compliant with schema
        OmegaConf.merge(cfg, schema)

        self.sigma = self._cfg.sigma
        self.audio_to_melspec_precessor = instantiate(self._cfg.preprocessor)
        self.waveglow = instantiate(self._cfg.waveglow)
        self.loss = WaveGlowLoss()

    @GlowVocoder.mode.setter
    def mode(self, new_mode):
        if new_mode == OperationMode.training:
            self.train()
        else:
            self.eval()
        self._mode = new_mode
        self.waveglow.mode = new_mode

    @property
    def input_types(self):
        return {
            "audio": NeuralType(('B', 'T'), AudioSignal()),
            "audio_len": NeuralType(('B'), LengthsType()),
            "run_inverse": NeuralType(optional=True),
        }

    @property
    def output_types(self):
        if self.mode == OperationMode.training or self.mode == OperationMode.validation:
            output_dict = {
                "pred_normal_dist": NeuralType(('B', 'flowgroup', 'T'), NormalDistributionSamplesType()),
                "log_s_list": [NeuralType(('B', 'flowgroup', 'T'), VoidType())],  # TODO: Figure out a good typing
                "log_det_W_list": [NeuralType(elements_type=LogDeterminantType())],
            }
            if self.mode == OperationMode.validation:
                output_dict["audio_pred"] = NeuralType(('B', 'T'), AudioSignal())
                output_dict["spec"] = NeuralType(('B', 'T', 'D'), MelSpectrogramType())
                output_dict["spec_len"] = NeuralType(('B'), LengthsType())
            return output_dict
        return {
            "audio_pred": NeuralType(('B', 'T'), AudioSignal()),
        }

    @typecheck()
    def forward(self, *, audio, audio_len, run_inverse=True):
        if self.mode != self.waveglow.mode:
            raise ValueError(
                f"WaveGlowModel's mode {self.mode} does not match WaveGlowModule's mode {self.waveglow.mode}"
            )
        spec, spec_len = self.audio_to_melspec_precessor(audio, audio_len)
        tensors = self.waveglow(spec=spec, audio=audio, run_inverse=run_inverse, sigma=self.sigma)
        if self.mode == OperationMode.training:
            return tensors[:-1]  # z, log_s_list, log_det_W_list
        elif self.mode == OperationMode.validation:
            z, log_s_list, log_det_W_list, audio_pred = tensors
            return z, log_s_list, log_det_W_list, audio_pred, spec, spec_len
        return tensors  # audio_pred

    @typecheck(
        input_types={
            "spec": NeuralType(('B', 'D', 'T'), MelSpectrogramType()),
            "sigma": NeuralType(optional=True),
            "denoise": NeuralType(optional=True),
            "denoiser_strength": NeuralType(optional=True),
        },
        output_types={"audio": NeuralType(('B', 'T'), AudioSignal())},
    )
    def convert_spectrogram_to_audio(
        self, spec: torch.Tensor, sigma: float = 1.0, denoise: bool = True, denoiser_strength: float = 0.01
    ) -> torch.Tensor:
        with self.nemo_infer():
            self.waveglow.remove_weightnorm()
            audio = self.waveglow(
                spec=spec.to(self.waveglow.upsample.weight.dtype), run_inverse=True, audio=None, sigma=sigma
            )
            if denoise:
                audio = self.denoise(audio, denoiser_strength)

        return audio

    def training_step(self, batch, batch_idx):
        self.mode = OperationMode.training
        audio, audio_len = batch
        z, log_s_list, log_det_W_list = self(audio=audio, audio_len=audio_len, run_inverse=False)

        loss = self.loss(z=z, log_s_list=log_s_list, log_det_W_list=log_det_W_list, sigma=self.sigma)
        output = {
            'loss': loss,
            'progress_bar': {'training_loss': loss},
            'log': {'loss': loss},
        }
        return output

    def validation_step(self, batch, batch_idx):
        self.mode = OperationMode.validation
        audio, audio_len = batch
        z, log_s_list, log_det_W_list, audio_pred, spec, spec_len = self(
            audio=audio, audio_len=audio_len, run_inverse=(batch_idx == 0)
        )
        loss = self.loss(z=z, log_s_list=log_s_list, log_det_W_list=log_det_W_list, sigma=self.sigma)
        return {
            "val_loss": loss,
            "audio_pred": audio_pred,
            "mel_target": spec,
            "mel_len": spec_len,
        }

    def validation_epoch_end(self, outputs):
        if self.logger is not None and self.logger.experiment is not None:
            tb_logger = self.logger.experiment
            if isinstance(self.logger, LoggerCollection):
                for logger in self.logger:
                    if isinstance(logger, TensorBoardLogger):
                        tb_logger = logger.experiment
                        break
            waveglow_log_to_tb_func(
                tb_logger,
                outputs[0].values(),
                self.global_step,
                tag="eval",
                mel_fb=self.audio_to_melspec_precessor.fb,
            )
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        self.log('val_loss', avg_loss)

    def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, name: str = "train"):
        if "dataset" not in cfg or not isinstance(cfg.dataset, DictConfig):
            raise ValueError(f"No dataset for {name}")
        if "dataloader_params" not in cfg or not isinstance(cfg.dataloader_params, DictConfig):
            raise ValueError(f"No dataloader_params for {name}")
        if shuffle_should_be:
            if 'shuffle' not in cfg.dataloader_params:
                logging.warning(
                    f"Shuffle should be set to True for {self}'s {name} dataloader but was not found in its "
                    "config. Manually setting to True"
                )
                with open_dict(cfg["dataloader_params"]):
                    cfg.dataloader_params.shuffle = True
            elif not cfg.dataloader_params.shuffle:
                logging.error(f"The {name} dataloader for {self} has shuffle set to False!!!")
        elif not shuffle_should_be and cfg.dataloader_params.shuffle:
            logging.error(f"The {name} dataloader for {self} has shuffle set to True!!!")

        dataset = instantiate(cfg.dataset)
        return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params)

    def setup_training_data(self, cfg):
        self._train_dl = self.__setup_dataloader_from_config(cfg)

    def setup_validation_data(self, cfg):
        self._validation_dl = self.__setup_dataloader_from_config(cfg, shuffle_should_be=False, name="validation")

    @classmethod
    def list_available_models(cls) -> 'List[PretrainedModelInfo]':
        """
        This method returns a list of pre-trained models which can be instantiated directly from NVIDIA's NGC cloud.

        Returns:
            List of available pre-trained models.
        """
        list_of_models = []
        model = PretrainedModelInfo(
            pretrained_model_name="WaveGlow-22050Hz",
            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemottsmodels/versions/1.0.0a5/files/WaveGlow-22050Hz.nemo",
            description="This model is trained on LJSpeech sampled at 22050Hz, and can be used as a universal vocoder.",
            class_=cls,
        )
        list_of_models.append(model)
        return list_of_models

    @property
    def input_module(self):
        return self.waveglow

    @property
    def output_module(self):
        return self.waveglow

    def _prepare_for_export(self):
        self.update_bias_spect()
        self.waveglow._prepare_for_export()

    def forward_for_export(self, spec, z=None):
        return self.waveglow(spec, z)
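

# Usage sketch (illustrative, not part of the original source): restore a
# pretrained WaveGlow checkpoint and vocode a mel spectrogram. The
# `from_pretrained` classmethod is inherited from NeMo's ModelPT base class,
# and the model name matches the `list_available_models` entry above. The
# dummy spectrogram shape assumes the default 80 mel bands.
def _example_waveglow_vocoding():
    import torch
    from nemo.collections.tts.models import WaveGlowModel

    vocoder = WaveGlowModel.from_pretrained(model_name="WaveGlow-22050Hz")
    vocoder.eval()
    # Hypothetical input with shape (batch, n_mel_channels, frames).
    spec = torch.randn(1, 80, 100)
    # convert_spectrogram_to_audio is defined above; denoising is on by default.
    audio = vocoder.convert_spectrogram_to_audio(spec=spec, sigma=1.0, denoise=True)
    return audio  # (batch, samples) at 22050 Hz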
def output_types(self): return {"mel_spec": NeuralType(('B', 'T', 'C'), MelSpectrogramType())}
def output_types(self): return { "out": NeuralType(('B', 'T', 'D'), EncodedRepresentation()), "mask": NeuralType(('B', 'T', 'D'), MaskType()), }
def input_types(self): return { "enc": NeuralType(('B', 'T', 'D'), EncodedRepresentation()), "enc_mask": NeuralType(('B', 'T', 1), TokenDurationType()), }
def input_types(self): return { "audio": NeuralType(('B', 'T'), AudioSignal()), "audio_len": NeuralType(('B'), LengthsType()), }
def input_types(self): return { "spect_predicted": NeuralType(('B', 'D', 'T'), MelSpectrogramType()), "spect_tgt": NeuralType(('B', 'D', 'T'), MelSpectrogramType()), }
def output_types(self): return { "loss": NeuralType(elements_type=LossType()), }
class MixerTTSModel(SpectrogramGenerator, Exportable):
    """Mixer-TTS and Mixer-TTS-X models (https://arxiv.org/abs/2110.03584) that are used to generate mel spectrograms from text."""

    def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
        # Convert to Hydra 1.0 compatible DictConfig
        cfg = model_utils.convert_model_config_to_dict_config(cfg)
        cfg = model_utils.maybe_update_config_version(cfg)

        # Setup normalizer
        self.normalizer = None
        self.text_normalizer_call = None
        self.text_normalizer_call_kwargs = {}
        self._setup_normalizer(cfg)

        # Setup tokenizer
        self.tokenizer = None
        self._setup_tokenizer(cfg)
        assert self.tokenizer is not None

        num_tokens = len(self.tokenizer.tokens)
        self.tokenizer_pad = self.tokenizer.pad
        self.tokenizer_unk = self.tokenizer.oov

        super().__init__(cfg=cfg, trainer=trainer)

        self.pitch_loss_scale = cfg.pitch_loss_scale
        self.durs_loss_scale = cfg.durs_loss_scale
        self.mel_loss_scale = cfg.mel_loss_scale

        self.aligner = instantiate(cfg.alignment_module)
        self.forward_sum_loss = ForwardSumLoss()
        self.bin_loss = BinLoss()
        self.add_bin_loss = False
        self.bin_loss_scale = 0.0
        self.bin_loss_start_ratio = cfg.bin_loss_start_ratio
        self.bin_loss_warmup_epochs = cfg.bin_loss_warmup_epochs

        self.cond_on_lm_embeddings = cfg.get("cond_on_lm_embeddings", False)

        if self.cond_on_lm_embeddings:
            self.lm_padding_value = (
                self._train_dl.dataset.lm_padding_value
                if self._train_dl is not None
                else self._get_lm_padding_value(cfg.lm_model)
            )
            self.lm_embeddings = self._get_lm_embeddings(cfg.lm_model)
            self.lm_embeddings.weight.requires_grad = False

            self.self_attention_module = instantiate(
                cfg.self_attention_module, n_lm_tokens_channels=self.lm_embeddings.weight.shape[1]
            )

        self.encoder = instantiate(cfg.encoder, num_tokens=num_tokens, padding_idx=self.tokenizer_pad)
        self.symbol_emb = self.encoder.to_embed

        self.duration_predictor = instantiate(cfg.duration_predictor)

        self.pitch_mean, self.pitch_std = float(cfg.pitch_mean), float(cfg.pitch_std)
        self.pitch_predictor = instantiate(cfg.pitch_predictor)
        self.pitch_emb = instantiate(cfg.pitch_emb)

        self.preprocessor = instantiate(cfg.preprocessor)

        self.decoder = instantiate(cfg.decoder)
        self.proj = nn.Linear(self.decoder.d_model, cfg.n_mel_channels)

    def _setup_normalizer(self, cfg):
        if "text_normalizer" in cfg:
            normalizer_kwargs = {}

            if "whitelist" in cfg.text_normalizer:
                normalizer_kwargs["whitelist"] = self.register_artifact(
                    'text_normalizer.whitelist', cfg.text_normalizer.whitelist
                )

            self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs)
            self.text_normalizer_call = self.normalizer.normalize
            if "text_normalizer_call_kwargs" in cfg:
                self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs

    def _setup_tokenizer(self, cfg):
        text_tokenizer_kwargs = {}
        if "g2p" in cfg.text_tokenizer:
            g2p_kwargs = {}

            if "phoneme_dict" in cfg.text_tokenizer.g2p:
                g2p_kwargs["phoneme_dict"] = self.register_artifact(
                    'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict,
                )

            if "heteronyms" in cfg.text_tokenizer.g2p:
                g2p_kwargs["heteronyms"] = self.register_artifact(
                    'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms,
                )

            text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs)

        self.tokenizer = instantiate(cfg.text_tokenizer, **text_tokenizer_kwargs)

    def _get_lm_model_tokenizer(self, lm_model="albert"):
        if getattr(self, "_lm_model_tokenizer", None) is not None:
            return self._lm_model_tokenizer

        if self._train_dl is not None and self._train_dl.dataset is not None:
            self._lm_model_tokenizer = self._train_dl.dataset.lm_model_tokenizer

        if lm_model == "albert":
            self._lm_model_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        else:
            raise NotImplementedError(
                f"{lm_model} lm model is not supported. Only albert is supported at this moment."
            )

        return self._lm_model_tokenizer

    def _get_lm_embeddings(self, lm_model="albert"):
        if lm_model == "albert":
            return transformers.AlbertModel.from_pretrained('albert-base-v2').embeddings.word_embeddings
        else:
            raise NotImplementedError(
                f"{lm_model} lm model is not supported. Only albert is supported at this moment."
            )

    def _get_lm_padding_value(self, lm_model="albert"):
        if lm_model == "albert":
            return transformers.AlbertTokenizer.from_pretrained('albert-base-v2')._convert_token_to_id('<pad>')
        else:
            raise NotImplementedError(
                f"{lm_model} lm model is not supported. Only albert is supported at this moment."
            )

    def _metrics(
        self,
        true_durs,
        true_text_len,
        pred_durs,
        true_pitch,
        pred_pitch,
        true_spect=None,
        pred_spect=None,
        true_spect_len=None,
        attn_logprob=None,
        attn_soft=None,
        attn_hard=None,
        attn_hard_dur=None,
    ):
        text_mask = get_mask_from_lengths(true_text_len)
        mel_mask = get_mask_from_lengths(true_spect_len)
        loss = 0.0

        # Dur loss and metrics
        durs_loss = F.mse_loss(pred_durs, (true_durs + 1).float().log(), reduction='none')
        durs_loss = durs_loss * text_mask.float()
        durs_loss = durs_loss.sum() / text_mask.sum()

        durs_pred = pred_durs.exp() - 1
        durs_pred = torch.clamp_min(durs_pred, min=0)
        durs_pred = durs_pred.round().long()

        acc = ((true_durs == durs_pred) * text_mask).sum().float() / text_mask.sum() * 100
        acc_dist_1 = (((true_durs - durs_pred).abs() <= 1) * text_mask).sum().float() / text_mask.sum() * 100
        acc_dist_3 = (((true_durs - durs_pred).abs() <= 3) * text_mask).sum().float() / text_mask.sum() * 100

        pred_spect = pred_spect.transpose(1, 2)

        # Mel loss
        mel_loss = F.mse_loss(pred_spect, true_spect, reduction='none').mean(dim=-2)
        mel_loss = mel_loss * mel_mask.float()
        mel_loss = mel_loss.sum() / mel_mask.sum()

        loss = loss + self.durs_loss_scale * durs_loss + self.mel_loss_scale * mel_loss

        # Aligner loss
        bin_loss, ctc_loss = None, None
        ctc_loss = self.forward_sum_loss(attn_logprob=attn_logprob, in_lens=true_text_len, out_lens=true_spect_len)
        loss = loss + ctc_loss
        if self.add_bin_loss:
            bin_loss = self.bin_loss(hard_attention=attn_hard, soft_attention=attn_soft)
            loss = loss + self.bin_loss_scale * bin_loss
        true_avg_pitch = average_pitch(true_pitch.unsqueeze(1), attn_hard_dur).squeeze(1)

        # Pitch loss
        pitch_loss = F.mse_loss(pred_pitch, true_avg_pitch, reduction='none')  # noqa
        pitch_loss = (pitch_loss * text_mask).sum() / text_mask.sum()

        loss = loss + self.pitch_loss_scale * pitch_loss

        return loss, durs_loss, acc, acc_dist_1, acc_dist_3, pitch_loss, mel_loss, ctc_loss, bin_loss

    @torch.jit.unused
    def run_aligner(self, text, text_len, text_mask, spect, spect_len, attn_prior):
        text_emb = self.symbol_emb(text)
        attn_soft, attn_logprob = self.aligner(
            spect, text_emb.permute(0, 2, 1), mask=text_mask == 0, attn_prior=attn_prior,
        )
        attn_hard = binarize_attention_parallel(attn_soft, text_len, spect_len)
        attn_hard_dur = attn_hard.sum(2)[:, 0, :]
        assert torch.all(torch.eq(attn_hard_dur.sum(dim=1), spect_len))
        return attn_soft, attn_logprob, attn_hard, attn_hard_dur

    @typecheck(
        input_types={
            "text": NeuralType(('B', 'T_text'), TokenIndex()),
            "text_len": NeuralType(('B',), LengthsType()),
            "pitch": NeuralType(('B', 'T_audio'), RegressionValuesType(), optional=True),
            "spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType(), optional=True),
            "spect_len": NeuralType(('B',), LengthsType(), optional=True),
            "attn_prior": NeuralType(('B', 'T_spec', 'T_text'), ProbsType(), optional=True),
            "lm_tokens": NeuralType(('B', 'T_lm_tokens'), TokenIndex(), optional=True),
        },
        output_types={
            "pred_spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()),
            "durs_predicted": NeuralType(('B', 'T_text'), TokenDurationType()),
            "log_durs_predicted": NeuralType(('B', 'T_text'), TokenLogDurationType()),
            "pitch_predicted": NeuralType(('B', 'T_text'), RegressionValuesType()),
            "attn_soft": NeuralType(('B', 'S', 'T_spec', 'T_text'), ProbsType()),
            "attn_logprob": NeuralType(('B', 'S', 'T_spec', 'T_text'), LogprobsType()),
            "attn_hard": NeuralType(('B', 'S', 'T_spec', 'T_text'), ProbsType()),
            "attn_hard_dur": NeuralType(('B', 'T_text'), TokenDurationType()),
        },
    )
    def forward(self, text, text_len, pitch=None, spect=None, spect_len=None, attn_prior=None, lm_tokens=None):
        if self.training:
            assert pitch is not None

        text_mask = get_mask_from_lengths(text_len).unsqueeze(2)

        enc_out, enc_mask = self.encoder(text, text_mask)

        # Aligner
        attn_soft, attn_logprob, attn_hard, attn_hard_dur = None, None, None, None
        if spect is not None:
            attn_soft, attn_logprob, attn_hard, attn_hard_dur = self.run_aligner(
                text, text_len, text_mask, spect, spect_len, attn_prior
            )

        if self.cond_on_lm_embeddings:
            lm_emb = self.lm_embeddings(lm_tokens)
            lm_features = self.self_attention_module(
                enc_out, lm_emb, lm_emb, q_mask=enc_mask.squeeze(2), kv_mask=lm_tokens != self.lm_padding_value
            )

        # Duration predictor
        log_durs_predicted = self.duration_predictor(enc_out, enc_mask)
        durs_predicted = torch.clamp(log_durs_predicted.exp() - 1, 0)

        # Pitch predictor
        pitch_predicted = self.pitch_predictor(enc_out, enc_mask)

        # Avg pitch, add pitch_emb
        if not self.training:
            if pitch is not None:
                pitch = average_pitch(pitch.unsqueeze(1), attn_hard_dur).squeeze(1)
                pitch_emb = self.pitch_emb(pitch.unsqueeze(1))
            else:
                pitch_emb = self.pitch_emb(pitch_predicted.unsqueeze(1))
        else:
            pitch = average_pitch(pitch.unsqueeze(1), attn_hard_dur).squeeze(1)
            pitch_emb = self.pitch_emb(pitch.unsqueeze(1))

        enc_out = enc_out + pitch_emb.transpose(1, 2)

        if self.cond_on_lm_embeddings:
            enc_out = enc_out + lm_features

        # Regulate length
        len_regulated_enc_out, dec_lens = regulate_len(attn_hard_dur, enc_out)

        dec_out, dec_lens = self.decoder(len_regulated_enc_out, get_mask_from_lengths(dec_lens).unsqueeze(2))
        pred_spect = self.proj(dec_out)

        return (
            pred_spect,
            durs_predicted,
            log_durs_predicted,
            pitch_predicted,
            attn_soft,
            attn_logprob,
            attn_hard,
            attn_hard_dur,
        )

    def infer(
        self,
        text,
        text_len=None,
        text_mask=None,
        spect=None,
        spect_len=None,
        attn_prior=None,
        use_gt_durs=False,
        lm_tokens=None,
        pitch=None,
    ):
        if text_mask is None:
            text_mask = get_mask_from_lengths(text_len).unsqueeze(2)

        enc_out, enc_mask = self.encoder(text, text_mask)

        # Aligner
        attn_hard_dur = None
        if use_gt_durs:
            attn_soft, attn_logprob, attn_hard, attn_hard_dur = self.run_aligner(
                text, text_len, text_mask, spect, spect_len, attn_prior
            )

        if self.cond_on_lm_embeddings:
            lm_emb = self.lm_embeddings(lm_tokens)
            lm_features = self.self_attention_module(
                enc_out, lm_emb, lm_emb, q_mask=enc_mask.squeeze(2), kv_mask=lm_tokens != self.lm_padding_value
            )

        # Duration predictor
        log_durs_predicted = self.duration_predictor(enc_out, enc_mask)
        durs_predicted = torch.clamp(log_durs_predicted.exp() - 1, 0)

        # Avg pitch, pitch predictor
        if use_gt_durs and pitch is not None:
            pitch = average_pitch(pitch.unsqueeze(1), attn_hard_dur).squeeze(1)
            pitch_emb = self.pitch_emb(pitch.unsqueeze(1))
        else:
            pitch_predicted = self.pitch_predictor(enc_out, enc_mask)
            pitch_emb = self.pitch_emb(pitch_predicted.unsqueeze(1))

        # Add pitch emb
        enc_out = enc_out + pitch_emb.transpose(1, 2)

        if self.cond_on_lm_embeddings:
            enc_out = enc_out + lm_features

        if use_gt_durs:
            if attn_hard_dur is not None:
                len_regulated_enc_out, dec_lens = regulate_len(attn_hard_dur, enc_out)
            else:
                raise NotImplementedError
        else:
            len_regulated_enc_out, dec_lens = regulate_len(durs_predicted, enc_out)

        dec_out, _ = self.decoder(len_regulated_enc_out, get_mask_from_lengths(dec_lens).unsqueeze(2))
        pred_spect = self.proj(dec_out)

        return pred_spect

    def on_train_epoch_start(self):
        bin_loss_start_epoch = np.ceil(self.bin_loss_start_ratio * self._trainer.max_epochs)

        # Add bin loss when current_epoch >= bin_start_epoch
        if not self.add_bin_loss and self.current_epoch >= bin_loss_start_epoch:
            logging.info(f"Using hard attentions after epoch: {self.current_epoch}")
            self.add_bin_loss = True

        if self.add_bin_loss:
            self.bin_loss_scale = min((self.current_epoch - bin_loss_start_epoch) / self.bin_loss_warmup_epochs, 1.0)

    def training_step(self, batch, batch_idx):
        attn_prior, lm_tokens = None, None
        if self.cond_on_lm_embeddings:
            audio, audio_len, text, text_len, attn_prior, pitch, _, lm_tokens = batch
        else:
            audio, audio_len, text, text_len, attn_prior, pitch, _ = batch

        spect, spect_len = self.preprocessor(input_signal=audio, length=audio_len)

        # pitch normalization
        zero_pitch_idx = pitch == 0
        pitch = (pitch - self.pitch_mean) / self.pitch_std
        pitch[zero_pitch_idx] = 0.0

        (pred_spect, _, pred_log_durs, pred_pitch, attn_soft, attn_logprob, attn_hard, attn_hard_dur,) = self(
            text=text,
            text_len=text_len,
            pitch=pitch,
            spect=spect,
            spect_len=spect_len,
            attn_prior=attn_prior,
            lm_tokens=lm_tokens,
        )

        (loss, durs_loss, acc, acc_dist_1, acc_dist_3, pitch_loss, mel_loss, ctc_loss, bin_loss,) = self._metrics(
            pred_durs=pred_log_durs,
            pred_pitch=pred_pitch,
            true_durs=attn_hard_dur,
            true_text_len=text_len,
            true_pitch=pitch,
            true_spect=spect,
            pred_spect=pred_spect,
            true_spect_len=spect_len,
            attn_logprob=attn_logprob,
            attn_soft=attn_soft,
            attn_hard=attn_hard,
            attn_hard_dur=attn_hard_dur,
        )

        train_log = {
            'train_loss': loss,
            'train_durs_loss': durs_loss,
            'train_pitch_loss': torch.tensor(1.0).to(durs_loss.device) if pitch_loss is None else pitch_loss,
            'train_mel_loss': mel_loss,
            'train_durs_acc': acc,
            'train_durs_acc_dist_3': acc_dist_3,
            'train_ctc_loss': torch.tensor(1.0).to(durs_loss.device) if ctc_loss is None else ctc_loss,
            'train_bin_loss': torch.tensor(1.0).to(durs_loss.device) if bin_loss is None else bin_loss,
        }

        return {'loss': loss, 'progress_bar': {k: v.detach() for k, v in train_log.items()}, 'log': train_log}

    def validation_step(self, batch, batch_idx):
        attn_prior, lm_tokens = None, None
        if self.cond_on_lm_embeddings:
            audio, audio_len, text, text_len, attn_prior, pitch, _, lm_tokens = batch
        else:
            audio, audio_len, text, text_len, attn_prior, pitch, _ = batch

        spect, spect_len = self.preprocessor(input_signal=audio, length=audio_len)

        # pitch normalization
        zero_pitch_idx = pitch == 0
        pitch = (pitch - self.pitch_mean) / self.pitch_std
        pitch[zero_pitch_idx] = 0.0

        (pred_spect, _, pred_log_durs, pred_pitch, attn_soft, attn_logprob, attn_hard, attn_hard_dur,) = self(
            text=text,
            text_len=text_len,
            pitch=pitch,
            spect=spect,
            spect_len=spect_len,
            attn_prior=attn_prior,
            lm_tokens=lm_tokens,
        )

        (loss, durs_loss, acc, acc_dist_1, acc_dist_3, pitch_loss, mel_loss, ctc_loss, bin_loss,) = self._metrics(
            pred_durs=pred_log_durs,
            pred_pitch=pred_pitch,
            true_durs=attn_hard_dur,
            true_text_len=text_len,
            true_pitch=pitch,
            true_spect=spect,
            pred_spect=pred_spect,
            true_spect_len=spect_len,
            attn_logprob=attn_logprob,
            attn_soft=attn_soft,
            attn_hard=attn_hard,
            attn_hard_dur=attn_hard_dur,
        )

        # without ground truth internal features except for durations
        pred_spect, _, pred_log_durs, pred_pitch, attn_soft, attn_logprob, attn_hard, attn_hard_dur = self(
            text=text,
            text_len=text_len,
            pitch=None,
            spect=spect,
            spect_len=spect_len,
            attn_prior=attn_prior,
            lm_tokens=lm_tokens,
        )

        *_, with_pred_features_mel_loss, _, _ = self._metrics(
            pred_durs=pred_log_durs,
            pred_pitch=pred_pitch,
            true_durs=attn_hard_dur,
            true_text_len=text_len,
            true_pitch=pitch,
            true_spect=spect,
            pred_spect=pred_spect,
            true_spect_len=spect_len,
            attn_logprob=attn_logprob,
            attn_soft=attn_soft,
            attn_hard=attn_hard,
            attn_hard_dur=attn_hard_dur,
        )

        val_log = {
            'val_loss': loss,
            'val_durs_loss': durs_loss,
            'val_pitch_loss': torch.tensor(1.0).to(durs_loss.device) if pitch_loss is None else pitch_loss,
            'val_mel_loss': mel_loss,
            'val_with_pred_features_mel_loss': with_pred_features_mel_loss,
            'val_durs_acc': acc,
            'val_durs_acc_dist_3': acc_dist_3,
            'val_ctc_loss': torch.tensor(1.0).to(durs_loss.device) if ctc_loss is None else ctc_loss,
            'val_bin_loss': torch.tensor(1.0).to(durs_loss.device) if bin_loss is None else bin_loss,
        }
        self.log_dict(val_log, prog_bar=False, on_epoch=True, logger=True, sync_dist=True)

        if batch_idx == 0 and self.current_epoch % 5 == 0 and isinstance(self.logger, WandbLogger):
            specs = []
            pitches = []
            for i in range(min(3, spect.shape[0])):
                specs += [
                    wandb.Image(
                        plot_spectrogram_to_numpy(spect[i, :, : spect_len[i]].data.cpu().numpy()),
                        caption=f"gt mel {i}",
                    ),
                    wandb.Image(
                        plot_spectrogram_to_numpy(
                            pred_spect.transpose(1, 2)[i, :, : spect_len[i]].data.cpu().numpy()
                        ),
                        caption=f"pred mel {i}",
                    ),
                ]

                pitches += [
                    wandb.Image(
                        plot_pitch_to_numpy(
                            average_pitch(pitch.unsqueeze(1), attn_hard_dur)
                            .squeeze(1)[i, : text_len[i]]
                            .data.cpu()
                            .numpy(),
                            ylim_range=[-2.5, 2.5],
                        ),
                        caption=f"gt pitch {i}",
                    ),
                ]

                pitches += [
                    wandb.Image(
                        plot_pitch_to_numpy(
                            pred_pitch[i, : text_len[i]].data.cpu().numpy(), ylim_range=[-2.5, 2.5]
                        ),
                        caption=f"pred pitch {i}",
                    ),
                ]

            self.logger.experiment.log({"specs": specs, "pitches": pitches})

    @typecheck(
        input_types={
            "tokens": NeuralType(('B', 'T_text'), TokenIndex(), optional=True),
            "tokens_len": NeuralType(('B'), LengthsType(), optional=True),
            "lm_tokens": NeuralType(('B', 'T_lm_tokens'), TokenIndex(), optional=True),
            "raw_texts": [NeuralType(optional=True)],
            "lm_model": NeuralType(optional=True),
        },
        output_types={
            "spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()),
        },
    )
    def generate_spectrogram(
        self,
        tokens: Optional[torch.Tensor] = None,
        tokens_len: Optional[torch.Tensor] = None,
        lm_tokens: Optional[torch.Tensor] = None,
        raw_texts: Optional[List[str]] = None,
        norm_text_for_lm_model: bool = True,
        lm_model: str = "albert",
    ):
        if tokens is not None:
            if tokens_len is None:
                # It is assumed that padding is consecutive and only at the end
                tokens_len = (tokens != self.tokenizer.pad).sum(dim=-1)
        else:
            if raw_texts is None:
                raise ValueError("raw_texts must be specified if tokens is None")

            t_seqs = [self.tokenizer(t) for t in raw_texts]
            tokens = torch.nn.utils.rnn.pad_sequence(
                sequences=[torch.tensor(t, dtype=torch.long, device=self.device) for t in t_seqs],
                batch_first=True,
                padding_value=self.tokenizer.pad,
            )
            tokens_len = torch.tensor([len(t) for t in t_seqs], dtype=torch.long, device=tokens.device)

        if self.cond_on_lm_embeddings and lm_tokens is None:
            if raw_texts is None:
                raise ValueError("raw_texts must be specified if lm_tokens is None")

            lm_model_tokenizer = self._get_lm_model_tokenizer(lm_model)
            lm_padding_value = lm_model_tokenizer._convert_token_to_id('<pad>')
            lm_space_value = lm_model_tokenizer._convert_token_to_id('▁')

            assert isinstance(self.tokenizer, EnglishCharsTokenizer) or isinstance(
                self.tokenizer, EnglishPhonemesTokenizer
            )

            if norm_text_for_lm_model and self.text_normalizer_call is not None:
                raw_texts = [self.text_normalizer_call(t, **self.text_normalizer_call_kwargs) for t in raw_texts]

            preprocess_texts_as_tts_input = [self.tokenizer.text_preprocessing_func(t) for t in raw_texts]
            lm_tokens_as_ids_list = [
                lm_model_tokenizer.encode(t, add_special_tokens=False) for t in preprocess_texts_as_tts_input
            ]

            if self.tokenizer.pad_with_space:
                lm_tokens_as_ids_list = [[lm_space_value] + t + [lm_space_value] for t in lm_tokens_as_ids_list]

            lm_tokens = torch.full(
                (len(lm_tokens_as_ids_list), max([len(t) for t in lm_tokens_as_ids_list])),
                fill_value=lm_padding_value,
                device=tokens.device,
            )
            for i, lm_tokens_i in enumerate(lm_tokens_as_ids_list):
                lm_tokens[i, : len(lm_tokens_i)] = torch.tensor(lm_tokens_i, device=tokens.device)

        pred_spect = self.infer(tokens, tokens_len, lm_tokens=lm_tokens).transpose(1, 2)
        return pred_spect

    def parse(self, text: str, normalize=True) -> torch.Tensor:
        if self.training:
            logging.warning("parse() is meant to be called in eval mode.")
        if normalize and self.text_normalizer_call is not None:
            text = self.text_normalizer_call(text, **self.text_normalizer_call_kwargs)

        eval_phon_mode = contextlib.nullcontext()
        if hasattr(self.tokenizer, "set_phone_prob"):
            eval_phon_mode = self.tokenizer.set_phone_prob(prob=1.0)

        with eval_phon_mode:
            tokens = self.tokenizer.encode(text)
        return torch.tensor(tokens).long().unsqueeze(0).to(self.device)

    def _loader(self, cfg):
        try:
            _ = cfg.dataset.manifest_filepath
        except omegaconf.errors.MissingMandatoryValue:
            logging.warning("manifest_filepath was skipped. No dataset for this model.")
            return None

        dataset = instantiate(
            cfg.dataset,
            text_normalizer=self.normalizer,
            text_normalizer_call_kwargs=self.text_normalizer_call_kwargs,
            text_tokenizer=self.tokenizer,
        )
        return torch.utils.data.DataLoader(  # noqa
            dataset=dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params,
        )

    def setup_training_data(self, cfg):
        self._train_dl = self._loader(cfg)

    def setup_validation_data(self, cfg):
        self._validation_dl = self._loader(cfg)

    def setup_test_data(self, cfg):
        """Omitted."""
        pass

    @classmethod
    def list_available_models(cls) -> 'List[PretrainedModelInfo]':
        """
        This method returns a list of pre-trained models which can be instantiated directly from NVIDIA's NGC cloud.

        Returns:
            List of available pre-trained models.
        """
        list_of_models = []

        model = PretrainedModelInfo(
            pretrained_model_name="tts_en_lj_mixertts",
            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_mixertts/versions/1.6.0/files/tts_en_lj_mixertts.nemo",
            description="This model is trained on LJSpeech sampled at 22050Hz and can be used to generate female English voices with an American accent.",
            class_=cls,  # noqa
        )
        list_of_models.append(model)

        model = PretrainedModelInfo(
            pretrained_model_name="tts_en_lj_mixerttsx",
            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_mixerttsx/versions/1.6.0/files/tts_en_lj_mixerttsx.nemo",
            description="This model is trained on LJSpeech sampled at 22050Hz and can be used to generate female English voices with an American accent.",
            class_=cls,  # noqa
        )
        list_of_models.append(model)

        return list_of_models

    # Methods for model exportability
    @property
    def input_types(self):
        return {
            "text": NeuralType(('B', 'T_text'), TokenIndex()),
            "lm_tokens": NeuralType(('B', 'T_lm_tokens'), TokenIndex(), optional=True),
        }

    @property
    def output_types(self):
        return {
            "spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()),
        }

    def input_example(self, max_text_len=10, max_lm_tokens_len=10):
        text = torch.randint(
            low=0, high=len(self.tokenizer.tokens), size=(1, max_text_len), device=self.device, dtype=torch.long,
        )
        inputs = {'text': text}

        if self.cond_on_lm_embeddings:
            inputs['lm_tokens'] = torch.randint(
                low=0,
                high=self.lm_embeddings.weight.shape[0],
                size=(1, max_lm_tokens_len),
                device=self.device,
                dtype=torch.long,
            )

        return (inputs,)

    def forward_for_export(self, text, lm_tokens=None):
        text_mask = (text != self.tokenizer_pad).unsqueeze(2)
        spect = self.infer(text=text, text_mask=text_mask, lm_tokens=lm_tokens).transpose(1, 2)
        return spect.to(torch.float)
class UnivNetModel(Vocoder, Exportable):
    """UnivNet model (https://arxiv.org/abs/2106.07889) that is used to generate audio from mel spectrograms."""

    def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
        # Convert to Hydra 1.0 compatible DictConfig
        cfg = model_utils.convert_model_config_to_dict_config(cfg)
        cfg = model_utils.maybe_update_config_version(cfg)

        super().__init__(cfg=cfg, trainer=trainer)

        self.audio_to_melspec_precessor = instantiate(cfg.preprocessor)
        # We use a separate preprocessor for training, because we need to pass grads and remove the pitch fmax limitation
        self.trg_melspec_fn = instantiate(cfg.preprocessor, highfreq=None, use_grads=True)
        self.generator = instantiate(
            cfg.generator, n_mel_channels=cfg.preprocessor.nfilt, hop_length=cfg.preprocessor.n_window_stride
        )
        self.mpd = MultiPeriodDiscriminator(cfg.discriminator.mpd, debug=cfg.debug if "debug" in cfg else False)
        self.mrd = MultiResolutionDiscriminator(cfg.discriminator.mrd, debug=cfg.debug if "debug" in cfg else False)

        self.discriminator_loss = DiscriminatorLoss()
        self.generator_loss = GeneratorLoss()

        # Reshape the MRD resolutions hyperparameter and apply it to the MRSTFT loss
        self.stft_resolutions = cfg.discriminator.mrd.resolutions
        self.fft_sizes = [res[0] for res in self.stft_resolutions]
        self.hop_sizes = [res[1] for res in self.stft_resolutions]
        self.win_lengths = [res[2] for res in self.stft_resolutions]
        self.mrstft_loss = MultiResolutionSTFTLoss(self.fft_sizes, self.hop_sizes, self.win_lengths)
        self.stft_lamb = cfg.stft_lamb

        self.sample_rate = self._cfg.preprocessor.sample_rate
        self.stft_bias = None

        self.input_as_mel = False
        if self._train_dl:
            self.input_as_mel = self._train_dl.dataset.load_precomputed_mel

        self.automatic_optimization = False

    def _get_max_steps(self):
        return compute_max_steps(
            max_epochs=self._cfg.max_epochs,
            accumulate_grad_batches=self.trainer.accumulate_grad_batches,
            limit_train_batches=self.trainer.limit_train_batches,
            num_workers=get_num_workers(self.trainer),
            num_samples=len(self._train_dl.dataset),
            batch_size=get_batch_size(self._train_dl),
            drop_last=self._train_dl.drop_last,
        )

    def configure_optimizers(self):
        optim_g = instantiate(self._cfg.optim, params=self.generator.parameters(),)
        optim_d = instantiate(
            self._cfg.optim, params=itertools.chain(self.mrd.parameters(), self.mpd.parameters()),
        )
        return [optim_g, optim_d]

    @typecheck()
    def forward(self, *, spec):
        """
        Runs the generator; for inputs and outputs see input_types and output_types.
        """
        return self.generator(x=spec)

    @typecheck(
        input_types={"spec": NeuralType(('B', 'C', 'T'), MelSpectrogramType())},
        output_types={"audio": NeuralType(('B', 'T'), AudioSignal())},
    )
    def convert_spectrogram_to_audio(self, spec: 'torch.tensor') -> 'torch.tensor':
        return self(spec=spec).squeeze(1)

    def training_step(self, batch, batch_idx):
        if self.input_as_mel:
            # Pre-computed spectrograms will be used as input
            audio, audio_len, audio_mel = batch
        else:
            audio, audio_len = batch
            audio_mel, _ = self.audio_to_melspec_precessor(audio, audio_len)
        audio = audio.unsqueeze(1)

        audio_pred = self.generator(x=audio_mel)
        audio_pred_mel, _ = self.trg_melspec_fn(audio_pred.squeeze(1), audio_len)

        optim_g, optim_d = self.optimizers()

        # Train discriminator
        optim_d.zero_grad()
        mpd_score_real, mpd_score_gen, _, _ = self.mpd(y=audio, y_hat=audio_pred.detach())
        loss_disc_mpd, _, _ = self.discriminator_loss(
            disc_real_outputs=mpd_score_real, disc_generated_outputs=mpd_score_gen
        )
        mrd_score_real, mrd_score_gen, _, _ = self.mrd(y=audio, y_hat=audio_pred.detach())
        loss_disc_mrd, _, _ = self.discriminator_loss(
            disc_real_outputs=mrd_score_real, disc_generated_outputs=mrd_score_gen
        )
        loss_d = loss_disc_mrd + loss_disc_mpd
        self.manual_backward(loss_d)
        optim_d.step()

        # Train generator
        optim_g.zero_grad()
        loss_sc, loss_mag = self.mrstft_loss(x=audio_pred.squeeze(1), y=audio.squeeze(1), input_lengths=audio_len)
        loss_sc = torch.stack(loss_sc).mean()
        loss_mag = torch.stack(loss_mag).mean()
        loss_mrstft = (loss_sc + loss_mag) * self.stft_lamb
        _, mpd_score_gen, _, _ = self.mpd(y=audio, y_hat=audio_pred)
        _, mrd_score_gen, _, _ = self.mrd(y=audio, y_hat=audio_pred)
        loss_gen_mpd, _ = self.generator_loss(disc_outputs=mpd_score_gen)
        loss_gen_mrd, _ = self.generator_loss(disc_outputs=mrd_score_gen)
        loss_g = loss_gen_mrd + loss_gen_mpd + loss_mrstft
        self.manual_backward(loss_g)
        optim_g.step()

        metrics = {
            "g_loss_sc": loss_sc,
            "g_loss_mag": loss_mag,
            "g_loss_mrstft": loss_mrstft,
            "g_loss_gen_mpd": loss_gen_mpd,
            "g_loss_gen_mrd": loss_gen_mrd,
            "g_loss": loss_g,
            "d_loss_mpd": loss_disc_mpd,
            "d_loss_mrd": loss_disc_mrd,
            "d_loss": loss_d,
            "global_step": self.global_step,
            "lr": optim_g.param_groups[0]['lr'],
        }
        self.log_dict(metrics, on_step=True, sync_dist=True)
        self.log("g_mrstft_loss", loss_mrstft, prog_bar=True, logger=False, sync_dist=True)

    def validation_step(self, batch, batch_idx):
        if self.input_as_mel:
            audio, audio_len, audio_mel = batch
            audio_mel_len = [audio_mel.shape[1]] * audio_mel.shape[0]
        else:
            audio, audio_len = batch
            audio_mel, audio_mel_len = self.audio_to_melspec_precessor(audio, audio_len)
        audio_pred = self(spec=audio_mel)

        # Perform bias denoising
        pred_denoised = self._bias_denoise(audio_pred, audio_mel).squeeze(1)
        pred_denoised_mel, _ = self.audio_to_melspec_precessor(pred_denoised, audio_len)

        if self.input_as_mel:
            gt_mel, gt_mel_len = self.audio_to_melspec_precessor(audio, audio_len)
        audio_pred_mel, _ = self.audio_to_melspec_precessor(audio_pred.squeeze(1), audio_len)
        loss_mel = F.l1_loss(audio_mel, audio_pred_mel)

        self.log_dict({"val_loss": loss_mel}, on_epoch=True, sync_dist=True)

        # Plot audio once per epoch
        if batch_idx == 0 and isinstance(self.logger, WandbLogger) and HAVE_WANDB:
            clips = []
            specs = []
            for i in range(min(5, audio.shape[0])):
                clips += [
                    wandb.Audio(
                        audio[i, : audio_len[i]].data.cpu().numpy(),
                        caption=f"real audio {i}",
                        sample_rate=self.sample_rate,
                    ),
                    wandb.Audio(
                        audio_pred[i, 0, : audio_len[i]].data.cpu().numpy().astype('float32'),
                        caption=f"generated audio {i}",
                        sample_rate=self.sample_rate,
                    ),
                    wandb.Audio(
                        pred_denoised[i, : audio_len[i]].data.cpu().numpy(),
                        caption=f"denoised audio {i}",
                        sample_rate=self.sample_rate,
                    ),
                ]
                specs += [
                    wandb.Image(
                        plot_spectrogram_to_numpy(audio_mel[i, :, : audio_mel_len[i]].data.cpu().numpy()),
                        caption=f"input mel {i}",
                    ),
                    wandb.Image(
                        plot_spectrogram_to_numpy(audio_pred_mel[i, :, : audio_mel_len[i]].data.cpu().numpy()),
                        caption=f"output mel {i}",
                    ),
                    wandb.Image(
                        plot_spectrogram_to_numpy(pred_denoised_mel[i, :, : audio_mel_len[i]].data.cpu().numpy()),
                        caption=f"denoised mel {i}",
                    ),
                ]
                if self.input_as_mel:
                    specs += [
                        wandb.Image(
                            plot_spectrogram_to_numpy(gt_mel[i, :, : audio_mel_len[i]].data.cpu().numpy()),
                            caption=f"gt mel {i}",
                        ),
                    ]

            self.logger.experiment.log({"audio": clips, "specs": specs})

    def _bias_denoise(self, audio, mel):
        def stft(x):
            comp = torch.stft(x.squeeze(1), n_fft=1024, hop_length=256, win_length=1024)
            real, imag = comp[..., 0], comp[..., 1]
            mags = torch.sqrt(real ** 2 + imag ** 2)
            phase = torch.atan2(imag, real)
            return mags, phase

        def istft(mags, phase):
            comp = torch.stack([mags * torch.cos(phase), mags * torch.sin(phase)], dim=-1)
            x = torch.istft(comp, n_fft=1024, hop_length=256, win_length=1024)
            return x

        # Create bias tensor
        if self.stft_bias is None or self.stft_bias.shape[0] != audio.shape[0]:
            audio_bias = self(spec=torch.zeros_like(mel, device=mel.device))
            self.stft_bias, _ = stft(audio_bias)
            self.stft_bias = self.stft_bias[:, :, 0][:, :, None]

        audio_mags, audio_phase = stft(audio)
        audio_mags = audio_mags - self.cfg.get("denoise_strength", 0.0025) * self.stft_bias
        audio_mags = torch.clamp(audio_mags, 0.0)
        audio_denoised = istft(audio_mags, audio_phase).unsqueeze(1)

        return audio_denoised

    def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, name: str = "train"):
        if "dataset" not in cfg or not isinstance(cfg.dataset, DictConfig):
            raise ValueError(f"No dataset for {name}")
        if "dataloader_params" not in cfg or not isinstance(cfg.dataloader_params, DictConfig):
            raise ValueError(f"No dataloader_params for {name}")
        if shuffle_should_be:
            if 'shuffle' not in cfg.dataloader_params:
                logging.warning(
                    f"Shuffle should be set to True for {self}'s {name} dataloader but was not found in its "
                    "config. Manually setting to True"
                )
                with open_dict(cfg["dataloader_params"]):
                    cfg.dataloader_params.shuffle = True
            elif not cfg.dataloader_params.shuffle:
                logging.error(f"The {name} dataloader for {self} has shuffle set to False!!!")
        elif not shuffle_should_be and cfg.dataloader_params.shuffle:
            logging.error(f"The {name} dataloader for {self} has shuffle set to True!!!")

        dataset = instantiate(cfg.dataset)
        return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params)

    def setup_training_data(self, cfg):
        self._train_dl = self.__setup_dataloader_from_config(cfg)

    def setup_validation_data(self, cfg):
        self._validation_dl = self.__setup_dataloader_from_config(cfg, shuffle_should_be=False, name="validation")

    def setup_test_data(self, cfg):
        pass

    @classmethod
    def list_available_models(cls) -> 'Optional[Dict[str, str]]':
        list_of_models = []
        model = PretrainedModelInfo(
            pretrained_model_name="tts_en_lj_univnet",
            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_univnet/versions/1.7.0/files/tts_en_lj_univnet.nemo",
            description="This model is trained on LJSpeech sampled at 22050Hz, and has been tested on generating female English voices with an American accent.",
            class_=cls,
        )
        list_of_models.append(model)

        model = PretrainedModelInfo(
            pretrained_model_name="tts_en_libritts_univnet",
            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_libritts_univnet/versions/1.7.0/files/tts_en_libritts_multispeaker_univnet.nemo",
            description="This model is trained on all LibriTTS training data (train-clean-100, train-clean-360, and train-other-500) sampled at 22050Hz, and has been tested on generating English voices.",
            class_=cls,
        )
        list_of_models.append(model)

        return list_of_models

    # Methods for model exportability
    def _prepare_for_export(self, **kwargs):
        if self.generator is not None:
            try:
                self.generator.remove_weight_norm()
            except ValueError:
                return

    @property
    def input_types(self):
        return {
            "spec": NeuralType(('B', 'D', 'T'), MelSpectrogramType()),
        }

    @property
    def output_types(self):
        return {
            "audio": NeuralType(('B', 'S', 'T'), AudioSignal(self.sample_rate)),
        }

    def input_example(self, max_batch=1, max_dim=256):
        """
        Generates input examples for tracing etc.

        Returns:
            A tuple of input examples.
        """
        par = next(self.parameters())
        mel = torch.randn((max_batch, self.cfg['preprocessor']['nfilt'], max_dim), device=par.device, dtype=par.dtype)
        return ({'spec': mel},)

    def forward_for_export(self, spec):
        """
        Runs the generator; for inputs and outputs see input_types and output_types.
        """
        return self.generator(x=spec)
class FastSpeech2Model(SpectrogramGenerator):
    """FastSpeech 2 model used to convert from text (phonemes) to mel spectrograms."""

    def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
        if isinstance(cfg, dict):
            cfg = OmegaConf.create(cfg)
        super().__init__(cfg=cfg, trainer=trainer)

        schema = OmegaConf.structured(FastSpeech2Config)
        # ModelPT ensures that cfg is a DictConfig, but do this second check in case ModelPT changes
        if isinstance(cfg, dict):
            cfg = OmegaConf.create(cfg)
        elif not isinstance(cfg, DictConfig):
            raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
        # Ensure passed cfg is compliant with schema
        OmegaConf.merge(cfg, schema)

        self.pitch = cfg.add_pitch_predictor
        self.energy = cfg.add_energy_predictor
        self.duration_coeff = cfg.duration_coeff

        self.audio_to_melspec_preprocessor = instantiate(self._cfg.preprocessor)
        self.encoder = instantiate(self._cfg.encoder)
        self.mel_decoder = instantiate(self._cfg.decoder)
        self.variance_adapter = instantiate(self._cfg.variance_adaptor)

        self.loss = L2MelLoss()
        self.mseloss = torch.nn.MSELoss()
        self.durationloss = DurationLoss()

        self.log_train_images = False

        # Parser and mappings are used for inference only.
        self.parser = parsers.make_parser(name='en')
        if 'mappings_filepath' in cfg:
            mappings_filepath = cfg.get('mappings_filepath')
        else:
            logging.error(
                "ERROR: You must specify a mappings.json file in the config file under model.mappings_filepath."
            )
        mappings_filepath = self.register_artifact('mappings_filepath', mappings_filepath)
        with open(mappings_filepath, 'r') as f:
            mappings = json.load(f)
            self.word2phones = mappings['word2phones']
            self.phone2idx = mappings['phone2idx']

    @typecheck(
        input_types={
            "text": NeuralType(('B', 'T'), TokenIndex()),
            "text_length": NeuralType(('B'), LengthsType()),
            "spec_len": NeuralType(('B'), LengthsType(), optional=True),
            "durations": NeuralType(('B', 'T'), TokenDurationType(), optional=True),
            "pitch": NeuralType(('B', 'T'), RegressionValuesType(), optional=True),
            "energies": NeuralType(('B', 'T'), RegressionValuesType(), optional=True),
        },
        output_types={
            "mel_spec": NeuralType(('B', 'T', 'C'), MelSpectrogramType()),
            "log_dur_preds": NeuralType(('B', 'T'), TokenDurationType(), optional=True),
            "pitch_preds": NeuralType(('B', 'T'), RegressionValuesType(), optional=True),
            "energy_preds": NeuralType(('B', 'T'), RegressionValuesType(), optional=True),
            "encoded_text_mask": NeuralType(('B', 'T', 'D'), MaskType()),
        },
    )
    def forward(self, *, text, text_length, spec_len=None, durations=None, pitch=None, energies=None):
        encoded_text, encoded_text_mask = self.encoder(text=text, text_length=text_length)
        aligned_text, log_dur_preds, pitch_preds, energy_preds, spec_len = self.variance_adapter(
            x=encoded_text,
            x_len=text_length,
            dur_target=durations,
            pitch_target=pitch,
            energy_target=energies,
            spec_len=spec_len,
        )
        mel = self.mel_decoder(decoder_input=aligned_text, lengths=spec_len)
        return mel, log_dur_preds, pitch_preds, energy_preds, encoded_text_mask

    def training_step(self, batch, batch_idx):
        f, fl, t, tl, durations, pitch, energies = batch
        spec, spec_len = self.audio_to_melspec_preprocessor(f, fl)
        mel, log_dur_preds, pitch_preds, energy_preds, encoded_text_mask = self(
            text=t, text_length=tl, spec_len=spec_len, durations=durations, pitch=pitch, energies=energies
        )

        total_loss = self.loss(
            spec_pred=mel.transpose(1, 2), spec_target=spec, spec_target_len=spec_len, pad_value=-11.52
        )
        self.log(name="train_mel_loss", value=total_loss.clone().detach())

        # Duration prediction loss
        dur_loss = self.durationloss(
            log_duration_pred=log_dur_preds, duration_target=durations.float(), mask=encoded_text_mask
        )
        dur_loss *= self.duration_coeff
        self.log(name="train_dur_loss", value=dur_loss)
        total_loss += dur_loss

        # Pitch prediction loss
        if self.pitch:
            pitch_loss = self.mseloss(pitch_preds, pitch)
            total_loss += pitch_loss
            self.log(name="train_pitch_loss", value=pitch_loss)

        # Energy prediction loss
        if self.energy:
            energy_loss = self.mseloss(energy_preds, energies)
            total_loss += energy_loss
            self.log(name="train_energy_loss", value=energy_loss)

        self.log(name="train_loss", value=total_loss)
        return {"loss": total_loss, "outputs": [spec, mel]}

    def training_epoch_end(self, outputs):
        if self.log_train_images and self.logger is not None and self.logger.experiment is not None:
            tb_logger = self.logger.experiment
            if isinstance(self.logger, LoggerCollection):
                for logger in self.logger:
                    if isinstance(logger, TensorBoardLogger):
                        tb_logger = logger.experiment
                        break
            spec_target, spec_predict = outputs[0]["outputs"]
            tb_logger.add_image(
                "train_mel_target",
                plot_spectrogram_to_numpy(spec_target[0].data.cpu().numpy()),
                self.global_step,
                dataformats="HWC",
            )
            spec_predict = spec_predict[0].data.cpu().numpy()
            tb_logger.add_image(
                "train_mel_predicted",
                plot_spectrogram_to_numpy(spec_predict.T),
                self.global_step,
                dataformats="HWC",
            )
            self.log_train_images = False
        return super().training_epoch_end(outputs)

    def validation_step(self, batch, batch_idx):
        f, fl, t, tl, _, _, _ = batch
        spec, spec_len = self.audio_to_melspec_preprocessor(f, fl)
        mel, _, _, _, _ = self(text=t, text_length=tl, spec_len=spec_len)
        loss = self.loss(spec_pred=mel.transpose(1, 2), spec_target=spec, spec_target_len=spec_len, pad_value=-11.52)
        return {
            "val_loss": loss,
            "mel_target": spec,
            "mel_pred": mel,
        }

    def validation_epoch_end(self, outputs):
        if self.logger is not None and self.logger.experiment is not None:
            tb_logger = self.logger.experiment
            if isinstance(self.logger, LoggerCollection):
                for logger in self.logger:
                    if isinstance(logger, TensorBoardLogger):
                        tb_logger = logger.experiment
                        break
            _, spec_target, spec_predict = outputs[0].values()
            tb_logger.add_image(
                "val_mel_target",
                plot_spectrogram_to_numpy(spec_target[0].data.cpu().numpy()),
                self.global_step,
                dataformats="HWC",
            )
            spec_predict = spec_predict[0].data.cpu().numpy()
            tb_logger.add_image(
                "val_mel_predicted",
                plot_spectrogram_to_numpy(spec_predict.T),
                self.global_step,
                dataformats="HWC",
            )
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()  # This reduces across batches, not workers!
        self.log('val_loss', avg_loss, sync_dist=True)
        self.log_train_images = True

    def parse(self, str_input: str, additional_word2phones=None) -> torch.tensor:
        """
        Parses text input and converts it to phoneme indices.

        str_input (str): The input text to be converted.
        additional_word2phones (dict): Optional dictionary mapping words to phonemes for updating the model's
            word2phones. This will not overwrite the existing dictionary, just update it with OOV or new mappings.
            Defaults to None, which will keep the existing mapping.
        """
        # Update the model's word2phones if applicable
        if additional_word2phones is not None:
            self.word2phones.update(additional_word2phones)

        # Convert text -> normalized text -> list of phones per word -> indices
        if str_input[-1] not in [".", "!", "?"]:
            str_input = str_input + "."
        norm_text = re.findall(r"""[\w']+|[.,!?;"]""", self.parser._normalize(str_input))

        try:
            phones = [self.word2phones[t] for t in norm_text]
        except KeyError as error:
            logging.error(
                f"ERROR: The following word in the input is not in the model's dictionary and could not be converted"
                f" to phonemes: ({error}).\n"
                f"You can pass in an `additional_word2phones` dictionary with a conversion for"
                f" this word, e.g. {{'{error}': ['phone1', 'phone2', ...]}} to update the model's mapping."
            )
            raise

        tokens = []
        for phone_list in phones:
            inds = [self.phone2idx[p] for p in phone_list]
            tokens += inds

        x = torch.tensor(tokens).unsqueeze_(0).long().to(self.device)
        return x

    @typecheck(output_types={"spect": NeuralType(('B', 'C', 'T'), MelSpectrogramType())})
    def generate_spectrogram(self, tokens: torch.Tensor) -> torch.Tensor:
        self.eval()
        token_len = torch.tensor([tokens.shape[1]]).to(self.device)
        spect, *_ = self(text=tokens, text_length=token_len)
        return spect.transpose(1, 2)

    def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, name: str = "train"):
        if "dataset" not in cfg or not isinstance(cfg.dataset, DictConfig):
            raise ValueError(f"No dataset for {name}")
        if "dataloader_params" not in cfg or not isinstance(cfg.dataloader_params, DictConfig):
            raise ValueError(f"No dataloader_params for {name}")
        if shuffle_should_be:
            if 'shuffle' not in cfg.dataloader_params:
                logging.warning(
                    f"Shuffle should be set to True for {self}'s {name} dataloader but was not found in its "
                    "config. Manually setting to True"
                )
                with open_dict(cfg.dataloader_params):
                    cfg.dataloader_params.shuffle = True
            elif not cfg.dataloader_params.shuffle:
                logging.error(f"The {name} dataloader for {self} has shuffle set to False!!!")
        elif not shuffle_should_be and cfg.dataloader_params.shuffle:
            logging.error(f"The {name} dataloader for {self} has shuffle set to True!!!")

        dataset = instantiate(cfg.dataset)
        return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params)

    def setup_training_data(self, cfg):
        self._train_dl = self.__setup_dataloader_from_config(cfg)

    def setup_validation_data(self, cfg):
        self._validation_dl = self.__setup_dataloader_from_config(cfg, shuffle_should_be=False, name="validation")

    @classmethod
    def list_available_models(cls) -> 'List[PretrainedModelInfo]':
        """
        This method returns a list of pre-trained models which can be instantiated directly from NVIDIA's NGC cloud.

        Returns:
            List of available pre-trained models.
        """
        list_of_models = []
        model = PretrainedModelInfo(
            pretrained_model_name="tts_en_fastspeech2",
            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastspeech_2/versions/1.0.0/files/tts_en_fastspeech2.nemo",
            description="This model is trained on LJSpeech sampled at 22050Hz, and can be used to generate female English voices with an American accent.",
            class_=cls,
            aliases=["FastSpeech2-22050Hz"],
        )
        list_of_models.append(model)
        return list_of_models
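

# Usage sketch (illustrative, not part of the original source): inference with
# FastSpeech 2. `parse` maps words to phoneme indices through the
# word2phones/phone2idx mappings loaded in __init__, so every word in the
# input must be in the mapping (or supplied via `additional_word2phones`, as
# documented above); otherwise parse() raises a KeyError.
def _example_fastspeech2_spectrogram():
    from nemo.collections.tts.models import FastSpeech2Model

    model = FastSpeech2Model.from_pretrained(model_name="tts_en_fastspeech2")
    model.eval()
    tokens = model.parse("Hello world.")
    # generate_spectrogram returns a (batch, n_mel_channels, frames) tensor.
    spec = model.generate_spectrogram(tokens=tokens)
    return spec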
class HifiGanModel(Vocoder, Exportable): def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): if isinstance(cfg, dict): cfg = OmegaConf.create(cfg) super().__init__(cfg=cfg, trainer=trainer) self.audio_to_melspec_precessor = instantiate(cfg.preprocessor) # use a different melspec extractor because: # 1. we need to pass grads # 2. we need remove fmax limitation self.trg_melspec_fn = instantiate(cfg.preprocessor, highfreq=None, use_grads=True) self.generator = instantiate(cfg.generator) self.mpd = MultiPeriodDiscriminator(debug=cfg.debug if "debug" in cfg else False) self.msd = MultiScaleDiscriminator(debug=cfg.debug if "debug" in cfg else False) self.feature_loss = FeatureMatchingLoss() self.discriminator_loss = DiscriminatorLoss() self.generator_loss = GeneratorLoss() self.l1_factor = cfg.get("l1_loss_factor", 45) self.sample_rate = self._cfg.preprocessor.sample_rate self.stft_bias = None if self._train_dl and isinstance(self._train_dl.dataset, MelAudioDataset): self.input_as_mel = True else: self.input_as_mel = False self.automatic_optimization = False def configure_optimizers(self): self.optim_g = instantiate(self._cfg.optim, params=self.generator.parameters(),) self.optim_d = instantiate( self._cfg.optim, params=itertools.chain(self.msd.parameters(), self.mpd.parameters()), ) if hasattr(self._cfg, 'sched'): self.scheduler_g = CosineAnnealing( optimizer=self.optim_g, max_steps=self._cfg.max_steps, min_lr=self._cfg.sched.min_lr, warmup_steps=self._cfg.sched.warmup_ratio * self._cfg.max_steps, ) # Use warmup to delay start sch1_dict = { 'scheduler': self.scheduler_g, 'interval': 'step', } self.scheduler_d = CosineAnnealing( optimizer=self.optim_d, max_steps=self._cfg.max_steps, min_lr=self._cfg.sched.min_lr, ) sch2_dict = { 'scheduler': self.scheduler_d, 'interval': 'step', } return [self.optim_g, self.optim_d], [sch1_dict, sch2_dict] else: return [self.optim_g, self.optim_d] @property def input_types(self): return { "spec": NeuralType(('B', 'D', 'T'), MelSpectrogramType()), } @property def output_types(self): return { "audio": NeuralType(('B', 'S', 'T'), AudioSignal(self.sample_rate)), } @typecheck() def forward(self, *, spec): """ Runs the generator, for inputs and outputs see input_types, and output_types """ return self.generator(x=spec) @typecheck( input_types={"spec": NeuralType(('B', 'C', 'T'), MelSpectrogramType())}, output_types={"audio": NeuralType(('B', 'T'), AudioSignal())}, ) def convert_spectrogram_to_audio(self, spec: 'torch.tensor') -> 'torch.tensor': return self(spec=spec).squeeze(1) def training_step(self, batch, batch_idx): # if in finetune mode the mels are pre-computed using a # spectrogram generator if self.input_as_mel: audio, audio_len, audio_mel = batch # else, we compute the mel using the ground truth audio else: audio, audio_len = batch # mel as input for generator audio_mel, _ = self.audio_to_melspec_precessor(audio, audio_len) # mel as input for L1 mel loss audio_trg_mel, _ = self.trg_melspec_fn(audio, audio_len) audio = audio.unsqueeze(1) audio_pred = self.generator(x=audio_mel) audio_pred_mel, _ = self.trg_melspec_fn(audio_pred.squeeze(1), audio_len) # train discriminator self.optim_d.zero_grad() mpd_score_real, mpd_score_gen, _, _ = self.mpd(y=audio, y_hat=audio_pred.detach()) loss_disc_mpd, _, _ = self.discriminator_loss( disc_real_outputs=mpd_score_real, disc_generated_outputs=mpd_score_gen ) msd_score_real, msd_score_gen, _, _ = self.msd(y=audio, y_hat=audio_pred.detach()) loss_disc_msd, _, _ = self.discriminator_loss( 
            disc_real_outputs=msd_score_real, disc_generated_outputs=msd_score_gen
        )
        loss_d = loss_disc_msd + loss_disc_mpd
        self.manual_backward(loss_d)
        self.optim_d.step()

        # train generator
        self.optim_g.zero_grad()
        loss_mel = F.l1_loss(audio_pred_mel, audio_trg_mel)
        _, mpd_score_gen, fmap_mpd_real, fmap_mpd_gen = self.mpd(y=audio, y_hat=audio_pred)
        _, msd_score_gen, fmap_msd_real, fmap_msd_gen = self.msd(y=audio, y_hat=audio_pred)
        loss_fm_mpd = self.feature_loss(fmap_r=fmap_mpd_real, fmap_g=fmap_mpd_gen)
        loss_fm_msd = self.feature_loss(fmap_r=fmap_msd_real, fmap_g=fmap_msd_gen)
        loss_gen_mpd, _ = self.generator_loss(disc_outputs=mpd_score_gen)
        loss_gen_msd, _ = self.generator_loss(disc_outputs=msd_score_gen)
        loss_g = loss_gen_msd + loss_gen_mpd + loss_fm_msd + loss_fm_mpd + loss_mel * self.l1_factor
        self.manual_backward(loss_g)
        self.optim_g.step()

        # run schedulers
        schedulers = self.lr_schedulers()
        if schedulers is not None:
            sch1, sch2 = schedulers
            sch1.step()
            sch2.step()

        metrics = {
            "g_loss_fm_mpd": loss_fm_mpd,
            "g_loss_fm_msd": loss_fm_msd,
            "g_loss_gen_mpd": loss_gen_mpd,
            "g_loss_gen_msd": loss_gen_msd,
            "g_loss": loss_g,
            "d_loss_mpd": loss_disc_mpd,
            "d_loss_msd": loss_disc_msd,
            "d_loss": loss_d,
            "global_step": self.global_step,
            "lr": self.optim_g.param_groups[0]['lr'],
        }
        self.log_dict(metrics, on_step=True, sync_dist=True)
        self.log("g_l1_loss", loss_mel, prog_bar=True, logger=False, sync_dist=True)

    def validation_step(self, batch, batch_idx):
        if self.input_as_mel:
            audio, audio_len, audio_mel = batch
            audio_mel_len = [audio_mel.shape[1]] * audio_mel.shape[0]
        else:
            audio, audio_len = batch
            audio_mel, audio_mel_len = self.audio_to_melspec_precessor(audio, audio_len)
        audio_pred = self(spec=audio_mel)

        # perform bias denoising
        pred_denoised = self._bias_denoise(audio_pred, audio_mel).squeeze(1)
        pred_denoised_mel, _ = self.audio_to_melspec_precessor(pred_denoised, audio_len)

        if self.input_as_mel:
            gt_mel, gt_mel_len = self.audio_to_melspec_precessor(audio, audio_len)
        audio_pred_mel, _ = self.audio_to_melspec_precessor(audio_pred.squeeze(1), audio_len)
        loss_mel = F.l1_loss(audio_mel, audio_pred_mel)

        self.log_dict({"val_loss": loss_mel}, on_epoch=True, sync_dist=True)

        # plot audio once per epoch
        if batch_idx == 0 and isinstance(self.logger, WandbLogger) and HAVE_WANDB:
            clips = []
            specs = []
            for i in range(min(5, audio.shape[0])):
                clips += [
                    wandb.Audio(
                        audio[i, : audio_len[i]].data.cpu().numpy(),
                        caption=f"real audio {i}",
                        sample_rate=self.sample_rate,
                    ),
                    wandb.Audio(
                        audio_pred[i, 0, : audio_len[i]].data.cpu().numpy().astype('float32'),
                        caption=f"generated audio {i}",
                        sample_rate=self.sample_rate,
                    ),
                    wandb.Audio(
                        pred_denoised[i, : audio_len[i]].data.cpu().numpy(),
                        caption=f"denoised audio {i}",
                        sample_rate=self.sample_rate,
                    ),
                ]
                specs += [
                    wandb.Image(
                        plot_spectrogram_to_numpy(audio_mel[i, :, : audio_mel_len[i]].data.cpu().numpy()),
                        caption=f"input mel {i}",
                    ),
                    wandb.Image(
                        plot_spectrogram_to_numpy(audio_pred_mel[i, :, : audio_mel_len[i]].data.cpu().numpy()),
                        caption=f"output mel {i}",
                    ),
                    wandb.Image(
                        plot_spectrogram_to_numpy(pred_denoised_mel[i, :, : audio_mel_len[i]].data.cpu().numpy()),
                        caption=f"denoised mel {i}",
                    ),
                ]
                if self.input_as_mel:
                    specs += [
                        wandb.Image(
                            plot_spectrogram_to_numpy(gt_mel[i, :, : audio_mel_len[i]].data.cpu().numpy()),
                            caption=f"gt mel {i}",
                        ),
                    ]
            self.logger.experiment.log({"audio": clips, "specs": specs})

    def _bias_denoise(self, audio, mel):
        def stft(x):
            comp = torch.stft(x.squeeze(1), n_fft=1024, hop_length=256, win_length=1024)
            real, imag = comp[..., 0], comp[..., 1]
            mags = torch.sqrt(real ** 2 + imag ** 2)
            phase = torch.atan2(imag, real)
            return mags, phase

        def istft(mags, phase):
            comp = torch.stack([mags * torch.cos(phase), mags * torch.sin(phase)], dim=-1)
            x = torch.istft(comp, n_fft=1024, hop_length=256, win_length=1024)
            return x

        # create bias tensor from the generator's response to an all-zero mel input
        if self.stft_bias is None or self.stft_bias.shape[0] != audio.shape[0]:
            audio_bias = self(spec=torch.zeros_like(mel, device=mel.device))
            self.stft_bias, _ = stft(audio_bias)
            self.stft_bias = self.stft_bias[:, :, 0][:, :, None]

        audio_mags, audio_phase = stft(audio)
        audio_mags = audio_mags - self.cfg.get("denoise_strength", 0.0025) * self.stft_bias
        audio_mags = torch.clamp(audio_mags, 0.0)
        audio_denoised = istft(audio_mags, audio_phase).unsqueeze(1)

        return audio_denoised

    def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, name: str = "train"):
        if "dataset" not in cfg or not isinstance(cfg.dataset, DictConfig):
            raise ValueError(f"No dataset for {name}")
        if "dataloader_params" not in cfg or not isinstance(cfg.dataloader_params, DictConfig):
            raise ValueError(f"No dataloader_params for {name}")
        if shuffle_should_be:
            if 'shuffle' not in cfg.dataloader_params:
                logging.warning(
                    f"Shuffle should be set to True for {self}'s {name} dataloader but was not found in its "
                    "config. Manually setting to True"
                )
                with open_dict(cfg["dataloader_params"]):
                    cfg.dataloader_params.shuffle = True
            elif not cfg.dataloader_params.shuffle:
                logging.error(f"The {name} dataloader for {self} has shuffle set to False!!!")
        elif not shuffle_should_be and cfg.dataloader_params.shuffle:
            logging.error(f"The {name} dataloader for {self} has shuffle set to True!!!")

        dataset = instantiate(cfg.dataset)
        return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params)

    def setup_training_data(self, cfg):
        self._train_dl = self.__setup_dataloader_from_config(cfg)

    def setup_validation_data(self, cfg):
        self._validation_dl = self.__setup_dataloader_from_config(cfg, shuffle_should_be=False, name="validation")

    @classmethod
    def list_available_models(cls) -> 'Optional[Dict[str, str]]':
        list_of_models = []
        model = PretrainedModelInfo(
            pretrained_model_name="tts_hifigan",
            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_hifigan/versions/1.0.0rc1/files/tts_hifigan.nemo",
            description="This model is trained on LJSpeech audio sampled at 22050Hz and mel spectrograms generated "
            "from Tacotron2, TalkNet, and FastPitch. This model has been tested on generating female English voices "
            "with an American accent.",
            class_=cls,
        )
        list_of_models.append(model)
        return list_of_models

    def load_state_dict(self, state_dict, strict=True):
        # override load_state_dict to give us some flexibility to be backward-compatible with old checkpoints
        new_state_dict = {}
        num_resblocks = len(self.cfg['generator']['resblock_kernel_sizes'])
        for k, v in state_dict.items():
            new_k = k
            if 'resblocks' in k:
                parts = k.split(".")
                # only remap if the checkpoint uses the older flat resblock layout
                if len(parts) == 6:
                    layer = int(parts[2])
                    new_layer = f"{layer // num_resblocks}.{layer % num_resblocks}"
                    new_k = f"generator.resblocks.{new_layer}.{'.'.join(parts[3:])}"
            new_state_dict[new_k] = v
        super().load_state_dict(new_state_dict, strict=strict)

    def _prepare_for_export(self, **kwargs):
        """
        Override this method to prepare module for export. This is in-place operation.
        Base version does common necessary module replacements (Apex etc)
        """
        if self.generator is not None:
            self.generator.remove_weight_norm()

    def input_example(self):
        """
        Generates input examples for tracing etc.
        Returns:
            A tuple of input examples.
        """
        par = next(self.parameters())
        mel = torch.randn((1, self.cfg['preprocessor']['nfilt'], 96), device=par.device, dtype=par.dtype)
        return ({'spec': mel},)

    def forward_for_export(self, spec):
        """
        Runs the generator, for inputs and outputs see input_types, and output_types
        """
        return self.generator(x=spec)
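# Usage sketch (illustrative only, not part of the model code above). A minimal example of
# driving the vocoder methods above, assuming the enclosing class is NeMo's HifiGanModel and
# the "tts_hifigan" checkpoint from list_available_models() is reachable.
def _example_hifigan_inference():
    import torch
    from nemo.collections.tts.models import HifiGanModel

    model = HifiGanModel.from_pretrained(model_name="tts_hifigan")
    model.eval()
    # A dummy (B, n_mels, T) mel spectrogram; a real one would come from a spectrogram
    # generator such as Tacotron2 or FastPitch.
    spec = torch.randn(1, 80, 96)
    with torch.no_grad():
        # validation_step above calls self(spec=...) the same way; output is (B, 1, T) audio
        audio = model(spec=spec)
    # Export uses input_example() and forward_for_export() defined above.
    model.export("hifigan.onnx")
    return audio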
class GlowVocoder(Vocoder):
    """Base class for all Vocoders that use a Glow or reversible Flow-based setup."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._mode = OperationMode.infer
        self.bias_spect = None
        self.stft = None  # Required to be defined in children classes
        self.n_mel = None  # Required to be defined in children classes

    @property
    def mode(self):
        return self._mode

    @contextmanager
    def temp_mode(self, mode):
        old_mode = self.mode
        self.mode = mode
        try:
            yield
        finally:
            self.mode = old_mode

    @contextmanager
    def nemo_infer(self):
        # Prepend with nemo to avoid any .infer() clashes with lightning or pytorch
        with ExitStack() as stack:
            stack.enter_context(self.temp_mode(OperationMode.infer))
            stack.enter_context(torch.no_grad())
            yield

    def check_children_attributes(self):
        if self.stft is None or self.n_mel is None:
            try:
                self.stft = self.audio_to_melspec_precessor.stft
                self.n_mel = self.audio_to_melspec_precessor.nfilt
            except AttributeError:
                raise AttributeError(
                    f"{self} did not have stft and n_mel defined. These two parameters are required for "
                    "GlowVocoder's methods to work"
                )

    def update_bias_spect(self):
        self.check_children_attributes()  # Ensure self.n_mel and self.stft are defined

        with self.nemo_infer():
            spect = torch.zeros((1, self.n_mel, 88)).to(self.device)
            bias_audio = self.convert_spectrogram_to_audio(spec=spect, sigma=0.0, denoise=False)
            bias_spect, _ = self.stft.transform(bias_audio)
            self.bias_spect = bias_spect[:, :, 0][:, :, None]

    @typecheck(
        input_types={"audio": NeuralType(('B', 'T'), AudioSignal()), "strength": NeuralType(optional=True)},
        output_types={"audio": NeuralType(('B', 'T'), AudioSignal())},
    )
    def denoise(self, audio: 'torch.tensor', strength: float = 0.01):
        self.check_children_attributes()  # Ensure self.n_mel and self.stft are defined

        if self.bias_spect is None:
            self.update_bias_spect()
        audio_spect, audio_angles = self.stft.transform(audio)
        audio_spect_denoised = audio_spect - self.bias_spect.to(audio.device) * strength
        audio_spect_denoised = torch.clamp(audio_spect_denoised, 0.0)
        audio_denoised = self.stft.inverse(audio_spect_denoised, audio_angles)
        return audio_denoised
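# Sketch (illustrative only): the spectral-subtraction idea behind GlowVocoder.denoise()
# in plain torch. The magnitude spectrum of the "bias" audio (the vocoder's output for an
# all-zero mel with sigma=0) is scaled by `strength`, subtracted per frequency bin, and
# clamped at zero. All tensor shapes below are assumptions chosen for illustration.
def _example_spectral_subtraction():
    import torch

    audio_spect = torch.rand(1, 513, 100)  # (B, freq_bins, frames) magnitudes
    bias_spect = torch.rand(1, 513, 1)     # per-bin bias, broadcast over all frames
    strength = 0.01
    # Subtract the scaled bias and clamp so magnitudes stay non-negative
    denoised = torch.clamp(audio_spect - strength * bias_spect, 0.0)
    return denoised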
    def output_types(self):
        return {
            "spec": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()),
        }
class FastSpeech2HifiGanE2EModel(TextToWaveform):
    """An end-to-end speech synthesis model based on FastSpeech2 and HiFiGan that converts strings to audio without
    using the intermediate mel spectrogram representation."""

    def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
        if isinstance(cfg, dict):
            cfg = OmegaConf.create(cfg)
        super().__init__(cfg=cfg, trainer=trainer)

        self.audio_to_melspec_precessor = instantiate(cfg.preprocessor)
        self.encoder = instantiate(cfg.encoder)
        self.variance_adapter = instantiate(cfg.variance_adaptor)

        self.generator = instantiate(cfg.generator)
        self.multiperioddisc = MultiPeriodDiscriminator()
        self.multiscaledisc = MultiScaleDiscriminator()

        self.melspec_fn = instantiate(cfg.preprocessor, highfreq=None, use_grads=True)
        self.mel_val_loss = L1MelLoss()
        self.durationloss = DurationLoss()
        self.feat_matching_loss = FeatureMatchingLoss()
        self.disc_loss = DiscriminatorLoss()
        self.gen_loss = GeneratorLoss()
        self.mseloss = torch.nn.MSELoss()

        self.energy = cfg.add_energy_predictor
        self.pitch = cfg.add_pitch_predictor
        self.mel_loss_coeff = cfg.mel_loss_coeff
        self.pitch_loss_coeff = cfg.pitch_loss_coeff
        self.energy_loss_coeff = cfg.energy_loss_coeff
        self.splice_length = cfg.splice_length

        self.use_energy_pred = False
        self.use_pitch_pred = False
        self.log_train_images = False
        self.logged_real_samples = False
        self._tb_logger = None
        self.sample_rate = cfg.sample_rate
        self.hop_size = cfg.hop_size

        # Parser and mappings are used for inference only.
        self.parser = parsers.make_parser(name='en')
        if 'mappings_filepath' in cfg:
            mappings_filepath = cfg.get('mappings_filepath')
        else:
            logging.error(
                "ERROR: You must specify a mappings.json file in the config file under model.mappings_filepath."
            )
        mappings_filepath = self.register_artifact('mappings_filepath', mappings_filepath)
        with open(mappings_filepath, 'r') as f:
            mappings = json.load(f)
            self.word2phones = mappings['word2phones']
            self.phone2idx = mappings['phone2idx']

    @property
    def tb_logger(self):
        if self._tb_logger is None:
            if self.logger is None or self.logger.experiment is None:
                return None
            tb_logger = self.logger.experiment
            if isinstance(self.logger, LoggerCollection):
                for logger in self.logger:
                    if isinstance(logger, TensorBoardLogger):
                        tb_logger = logger.experiment
                        break
            self._tb_logger = tb_logger
        return self._tb_logger

    def configure_optimizers(self):
        gen_params = chain(
            self.encoder.parameters(),
            self.generator.parameters(),
            self.variance_adapter.parameters(),
        )
        disc_params = chain(self.multiscaledisc.parameters(), self.multiperioddisc.parameters())
        opt1 = torch.optim.AdamW(disc_params, lr=self._cfg.lr)
        opt2 = torch.optim.AdamW(gen_params, lr=self._cfg.lr)

        num_procs = self._trainer.num_gpus * self._trainer.num_nodes
        num_samples = len(self._train_dl.dataset)
        batch_size = self._train_dl.batch_size
        iter_per_epoch = np.ceil(num_samples / (num_procs * batch_size))
        max_steps = iter_per_epoch * self._trainer.max_epochs
        logging.info(f"MAX STEPS: {max_steps}")

        sch1 = NoamAnnealing(opt1, d_model=256, warmup_steps=3000, max_steps=max_steps, min_lr=1e-5)
        sch1_dict = {
            'scheduler': sch1,
            'interval': 'step',
        }
        sch2 = NoamAnnealing(opt2, d_model=256, warmup_steps=3000, max_steps=max_steps, min_lr=1e-5)
        sch2_dict = {
            'scheduler': sch2,
            'interval': 'step',
        }
        return [opt1, opt2], [sch1_dict, sch2_dict]

    @typecheck(
        input_types={
            "text": NeuralType(('B', 'T'), TokenIndex()),
            "text_length": NeuralType(('B'), LengthsType()),
            "splice": NeuralType(optional=True),
            "spec_len": NeuralType(('B'), LengthsType(), optional=True),
            "durations": NeuralType(('B', 'T'), TokenDurationType(), optional=True),
            "pitch": NeuralType(('B', 'T'), RegressionValuesType(), optional=True),
            "energies": NeuralType(('B', 'T'), RegressionValuesType(), optional=True),
        },
        output_types={
            "audio": NeuralType(('B', 'S', 'T'), MelSpectrogramType()),
            "splices": NeuralType(),
            "log_dur_preds": NeuralType(('B', 'T'), TokenLogDurationType()),
            "pitch_preds": NeuralType(('B', 'T'), RegressionValuesType()),
            "energy_preds": NeuralType(('B', 'T'), RegressionValuesType()),
            "encoded_text_mask": NeuralType(('B', 'T', 'D'), MaskType()),
        },
    )
    def forward(self, *, text, text_length, splice=True, durations=None, pitch=None, energies=None, spec_len=None):
        encoded_text, encoded_text_mask = self.encoder(text=text, text_length=text_length)
        context, log_dur_preds, pitch_preds, energy_preds, spec_len = self.variance_adapter(
            x=encoded_text,
            x_len=text_length,
            dur_target=durations,
            pitch_target=pitch,
            energy_target=energies,
            spec_len=spec_len,
        )

        gen_in = context
        splices = None
        if splice:
            # Splice generated spec
            output = []
            splices = []
            for i, sample in enumerate(context):
                start = np.random.randint(low=0, high=min(int(sample.size(0)), int(spec_len[i])) - self.splice_length)
                output.append(sample[start : start + self.splice_length, :])
                splices.append(start)
            gen_in = torch.stack(output)

        output = self.generator(x=gen_in.transpose(1, 2))

        return output, splices, log_dur_preds, pitch_preds, energy_preds, encoded_text_mask

    def training_step(self, batch, batch_idx, optimizer_idx):
        f, fl, t, tl, durations, pitch, energies = batch
        spec, spec_len = self.audio_to_melspec_precessor(f, fl)

        # train discriminator
        if optimizer_idx == 0:
            with torch.no_grad():
                audio_pred, splices, _, _, _, _ = self(
                    text=t,
                    text_length=tl,
                    spec_len=spec_len,
                    durations=durations,
                    pitch=pitch if not self.use_pitch_pred else None,
                    energies=energies if not self.use_energy_pred else None,
                )
                real_audio = []
                for i, splice in enumerate(splices):
                    real_audio.append(f[i, splice * self.hop_size : (splice + self.splice_length) * self.hop_size])
                real_audio = torch.stack(real_audio).unsqueeze(1)

            real_score_mp, gen_score_mp, _, _ = self.multiperioddisc(real_audio, audio_pred)
            real_score_ms, gen_score_ms, _, _ = self.multiscaledisc(real_audio, audio_pred)

            loss_mp, loss_mp_real, _ = self.disc_loss(real_score_mp, gen_score_mp)
            loss_ms, loss_ms_real, _ = self.disc_loss(real_score_ms, gen_score_ms)
            loss_mp /= len(loss_mp_real)
            loss_ms /= len(loss_ms_real)
            loss_disc = loss_mp + loss_ms

            self.log("loss_discriminator", loss_disc, prog_bar=True)
            self.log("loss_discriminator_ms", loss_ms)
            self.log("loss_discriminator_mp", loss_mp)
            return loss_disc

        # train generator
        elif optimizer_idx == 1:
            audio_pred, splices, log_dur_preds, pitch_preds, energy_preds, encoded_text_mask = self(
                text=t,
                text_length=tl,
                spec_len=spec_len,
                durations=durations,
                pitch=pitch if not self.use_pitch_pred else None,
                energies=energies if not self.use_energy_pred else None,
            )
            real_audio = []
            for i, splice in enumerate(splices):
                real_audio.append(f[i, splice * self.hop_size : (splice + self.splice_length) * self.hop_size])
            real_audio = torch.stack(real_audio).unsqueeze(1)

            # Do HiFiGAN generator loss
            audio_length = torch.tensor([self.splice_length * self.hop_size for _ in range(real_audio.shape[0])]).to(
                real_audio.device
            )
            real_spliced_spec, _ = self.melspec_fn(real_audio.squeeze(), seq_len=audio_length)
            pred_spliced_spec, _ = self.melspec_fn(audio_pred.squeeze(), seq_len=audio_length)
            loss_mel = torch.nn.functional.l1_loss(real_spliced_spec, pred_spliced_spec)
            loss_mel *= self.mel_loss_coeff
            _, gen_score_mp, real_feat_mp, gen_feat_mp = self.multiperioddisc(real_audio, audio_pred)
            _, gen_score_ms, real_feat_ms, gen_feat_ms = self.multiscaledisc(real_audio, audio_pred)
            loss_gen_mp, list_loss_gen_mp = self.gen_loss(gen_score_mp)
            loss_gen_ms, list_loss_gen_ms = self.gen_loss(gen_score_ms)
            loss_gen_mp /= len(list_loss_gen_mp)
            loss_gen_ms /= len(list_loss_gen_ms)
            total_loss = loss_gen_mp + loss_gen_ms + loss_mel
            loss_feat_mp = self.feat_matching_loss(real_feat_mp, gen_feat_mp)
            loss_feat_ms = self.feat_matching_loss(real_feat_ms, gen_feat_ms)
            total_loss += loss_feat_mp + loss_feat_ms

            self.log(name="loss_gen_disc_feat", value=loss_feat_mp + loss_feat_ms)
            self.log(name="loss_gen_disc_feat_ms", value=loss_feat_ms)
            self.log(name="loss_gen_disc_feat_mp", value=loss_feat_mp)
            self.log(name="loss_gen_mel", value=loss_mel)
            self.log(name="loss_gen_disc", value=loss_gen_mp + loss_gen_ms)
            self.log(name="loss_gen_disc_mp", value=loss_gen_mp)
            self.log(name="loss_gen_disc_ms", value=loss_gen_ms)

            dur_loss = self.durationloss(
                log_duration_pred=log_dur_preds, duration_target=durations.float(), mask=encoded_text_mask
            )
            self.log(name="loss_gen_duration", value=dur_loss)
            total_loss += dur_loss
            if self.pitch:
                pitch_loss = self.mseloss(pitch_preds, pitch.float()) * self.pitch_loss_coeff
                total_loss += pitch_loss
                self.log(name="loss_gen_pitch", value=pitch_loss)
            if self.energy:
                energy_loss = self.mseloss(energy_preds, energies) * self.energy_loss_coeff
                total_loss += energy_loss
                self.log(name="loss_gen_energy", value=energy_loss)

            # Log images to tensorboard
            if self.log_train_images:
                self.log_train_images = False
                if self.logger is not None and self.logger.experiment is not None:
                    self.tb_logger.add_image(
                        "train_mel_target",
                        plot_spectrogram_to_numpy(real_spliced_spec[0].data.cpu().numpy()),
                        self.global_step,
                        dataformats="HWC",
                    )
                    spec_predict = pred_spliced_spec[0].data.cpu().numpy()
                    self.tb_logger.add_image(
                        "train_mel_predicted",
                        plot_spectrogram_to_numpy(spec_predict),
                        self.global_step,
                        dataformats="HWC",
                    )

            self.log(name="loss_gen", prog_bar=True, value=total_loss)
            return total_loss

    def validation_step(self, batch, batch_idx):
        f, fl, t, tl, _, _, _ = batch
        spec, spec_len = self.audio_to_melspec_precessor(f, fl)
        audio_pred, _, _, _, _, _ = self(text=t, text_length=tl, spec_len=spec_len, splice=False)
        audio_pred.squeeze_()
        pred_spec, _ = self.melspec_fn(audio_pred, seq_len=spec_len)
        loss = self.mel_val_loss(spec_pred=pred_spec, spec_target=spec, spec_target_len=spec_len, pad_value=-11.52)

        return {
            "val_loss": loss,
            "audio_target": f.squeeze() if batch_idx == 0 else None,
            "audio_pred": audio_pred if batch_idx == 0 else None,
        }

    def on_train_epoch_start(self):
        # Switch to using energy predictions after 50% of training
        if not self.use_energy_pred and self.current_epoch >= np.ceil(0.5 * self._trainer.max_epochs):
            logging.info(f"Using energy predictions after epoch: {self.current_epoch}")
            self.use_energy_pred = True

        # Switch to using pitch predictions after 62.5% of training
        if not self.use_pitch_pred and self.current_epoch >= np.ceil(0.625 * self._trainer.max_epochs):
            logging.info(f"Using pitch predictions after epoch: {self.current_epoch}")
            self.use_pitch_pred = True

    def validation_epoch_end(self, outputs):
        if self.tb_logger is not None:
            _, audio_target, audio_predict = outputs[0].values()
            if not self.logged_real_samples:
                self.tb_logger.add_audio("val_target", audio_target[0].data.cpu(), self.global_step, self.sample_rate)
                self.logged_real_samples = True
            audio_predict = audio_predict[0].data.cpu()
            self.tb_logger.add_audio("val_pred", audio_predict, self.global_step, self.sample_rate)
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()  # This reduces across batches, not workers!
        self.log('val_loss', avg_loss, sync_dist=True)
        self.log_train_images = True

    def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, name: str = "train"):
        if "dataset" not in cfg or not isinstance(cfg.dataset, DictConfig):
            raise ValueError(f"No dataset for {name}")
        if "dataloader_params" not in cfg or not isinstance(cfg.dataloader_params, DictConfig):
            raise ValueError(f"No dataloader_params for {name}")
        if shuffle_should_be:
            if 'shuffle' not in cfg.dataloader_params:
                logging.warning(
                    f"Shuffle should be set to True for {self}'s {name} dataloader but was not found in its "
                    "config. Manually setting to True"
                )
                with open_dict(cfg.dataloader_params):
                    cfg.dataloader_params.shuffle = True
            elif not cfg.dataloader_params.shuffle:
                logging.error(f"The {name} dataloader for {self} has shuffle set to False!!!")
        elif not shuffle_should_be and cfg.dataloader_params.shuffle:
            logging.error(f"The {name} dataloader for {self} has shuffle set to True!!!")

        dataset = instantiate(cfg.dataset)
        return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params)

    def setup_training_data(self, cfg):
        self._train_dl = self.__setup_dataloader_from_config(cfg)

    def setup_validation_data(self, cfg):
        self._validation_dl = self.__setup_dataloader_from_config(cfg, shuffle_should_be=False, name="validation")

    def parse(self, str_input: str, additional_word2phones=None) -> torch.tensor:
        """
        Parses text input and converts it to phoneme indices.

        str_input (str): The input text to be converted.
        additional_word2phones (dict): Optional dictionary mapping words to phonemes for updating the model's
            word2phones. This will not overwrite the existing dictionary, just update it with OOV or new mappings.
            Defaults to None, which will keep the existing mapping.
        """
        # Update model's word2phones if applicable
        if additional_word2phones is not None:
            self.word2phones.update(additional_word2phones)

        # Convert text -> normalized text -> list of phones per word -> indices
        if str_input[-1] not in [".", "!", "?"]:
            str_input = str_input + "."
        norm_text = re.findall(r"""[\w']+|[.,!?;"]""", self.parser._normalize(str_input))

        try:
            phones = [self.word2phones[t] for t in norm_text]
        except KeyError as error:
            logging.error(
                f"ERROR: The following word in the input is not in the model's dictionary and could not be converted"
                f" to phonemes: ({error}).\n"
                f"You can pass in an `additional_word2phones` dictionary with a conversion for"
                f" this word, e.g. {{'{error}': ['phone1', 'phone2', ...]}} to update the model's mapping."
            )
            raise

        tokens = []
        for phone_list in phones:
            inds = [self.phone2idx[p] for p in phone_list]
            tokens += inds

        x = torch.tensor(tokens).unsqueeze_(0).long().to(self.device)
        return x

    def convert_text_to_waveform(self, *, tokens):
        """
        Accepts tokens returned from self.parse() and returns a list of tensors.

        Note: The tensors in the list can have different lengths.
        """
        self.eval()
        token_len = torch.tensor([len(i) for i in tokens]).to(self.device)
        audio, _, log_dur_pred, _, _, _ = self(text=tokens, text_length=token_len, splice=False)
        audio = audio.squeeze(1)
        durations = torch.sum(torch.exp(log_dur_pred) - 1, 1).to(torch.int)
        audio_list = []
        for i, sample in enumerate(audio):
            audio_list.append(sample[: durations[i] * self.hop_size])

        return audio_list

    @classmethod
    def list_available_models(cls) -> 'List[PretrainedModelInfo]':
        """
        This method returns a list of pre-trained models that can be instantiated directly from NVIDIA's NGC cloud.
        Returns:
            List of available pre-trained models.
        """
        list_of_models = []
        model = PretrainedModelInfo(
            pretrained_model_name="tts_en_e2e_fastspeech2hifigan",
            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_e2e_fastspeech2hifigan/versions/1.0.0/files/tts_en_e2e_fastspeech2hifigan.nemo",
            description="This model is trained on LJSpeech sampled at 22050Hz and can be used to generate female "
            "English voices with an American accent.",
            class_=cls,
        )
        list_of_models.append(model)
        return list_of_models
    def input_types(self):
        return {
            "mel_spec": NeuralType(('B', 'D', 'T'), MelSpectrogramType()),
        }

    def input_types(self):
        return {
            "log_durs_predicted": NeuralType(('B', 'T'), TokenLogDurationType()),
            "durs_tgt": NeuralType(('B', 'T'), TokenDurationType()),
            "len": NeuralType(('B'), LengthsType()),
        }

    def input_types(self):
        return {
            "token_embedding": NeuralType(('B', 'D', 'T'), EmbeddedTextType()),
            "token_len": NeuralType(('B'), LengthsType()),
        }

    def input_types(self):
        return {
            "pitch_predicted": NeuralType(('B', 'T'), RegressionValuesType()),
            "pitch_tgt": NeuralType(('B', 'T'), RegressionValuesType()),
            "len": NeuralType(('B'), LengthsType()),
        }

    def output_types(self):
        return {
            "encoder_embedding": NeuralType(('B', 'T', 'D'), EmbeddedTextType()),
        }
class UniGlowModel(Vocoder):
    """UniGlow model used to convert between spectrograms and audio"""

    def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
        if isinstance(cfg, dict):
            cfg = OmegaConf.create(cfg)
        super().__init__(cfg=cfg, trainer=trainer)

        schema = OmegaConf.structured(UniGlowConfig)
        # ModelPT ensures that cfg is a DictConfig, but do this second check in case ModelPT changes
        if isinstance(cfg, dict):
            cfg = OmegaConf.create(cfg)
        elif not isinstance(cfg, DictConfig):
            raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
        # Ensure passed cfg is compliant with schema
        OmegaConf.merge(cfg, schema)

        self.sigma = self._cfg.sigma
        self.audio_to_melspec_precessor = instantiate(self._cfg.preprocessor)
        self.model = UniGlowModule(
            self._cfg.uniglow.n_mel_channels,
            self._cfg.uniglow.n_flows,
            self._cfg.uniglow.n_group,
            self._cfg.uniglow.n_wn_channels,
            self._cfg.uniglow.n_wn_layers,
            self._cfg.uniglow.wn_kernel_size,
            self.get_upsample_factor(),
        )
        self.mode = OperationMode.infer
        self.loss = UniGlowLoss(self._cfg.uniglow.stft_loss_coef)
        self.removed_weightnorm = False

    @property
    def mode(self):
        return self._mode

    @mode.setter
    def mode(self, new_mode):
        if new_mode == OperationMode.training:
            self.train()
        else:
            self.eval()
        self._mode = new_mode
        self.model.mode = new_mode

    @property
    def input_types(self):
        return {
            "audio": NeuralType(('B', 'T'), AudioSignal()),
            "audio_len": NeuralType(('B'), LengthsType()),
        }

    @property
    def output_types(self):
        if self.mode == OperationMode.training or self.mode == OperationMode.validation:
            output_dict = {
                "pred_normal_dist": NeuralType(('B', 'flowgroup', 'T'), NormalDistributionSamplesType()),
                "logdet": NeuralType(elements_type=LogDeterminantType()),
                "predicted_audio": NeuralType(('B', 'T'), AudioSignal()),
            }
            if self.mode == OperationMode.validation:
                output_dict["spec"] = NeuralType(('B', 'T', 'D'), MelSpectrogramType())
                output_dict["spec_len"] = NeuralType(('B'), LengthsType())
            return output_dict
        return {
            "audio_pred": NeuralType(('B', 'T'), AudioSignal()),
        }

    @typecheck()
    def forward(self, *, audio, audio_len):
        if self.mode != self.model.mode:
            raise ValueError(
                f"UniGlowModel's mode {self.mode} does not match UniGlowModule's mode {self.model.mode}"
            )
        spec, spec_len = self.audio_to_melspec_precessor(audio, audio_len)
        tensors = self.model(spec=spec, audio=audio, sigma=self.sigma)
        if self.mode == OperationMode.training:
            return tensors  # z, logdet, audio_pred
        elif self.mode == OperationMode.validation:
            z, logdet, audio_pred = tensors
            return z, logdet, audio_pred, spec, spec_len
        return tensors

    @typecheck(
        input_types={
            "spec": NeuralType(('B', 'D', 'T'), MelSpectrogramType()),
            "sigma": NeuralType(optional=True),
        },
        output_types={"audio": NeuralType(('B', 'T'), AudioSignal())},
    )
    def convert_spectrogram_to_audio(self, spec: torch.Tensor, sigma: float = 1.0) -> torch.Tensor:
        if not self.removed_weightnorm:
            self.model.remove_weightnorm()
            self.removed_weightnorm = True
        self.mode = OperationMode.infer

        with torch.no_grad():
            audio = self.model(spec=spec, audio=None, sigma=sigma)

        return audio

    def training_step(self, batch, batch_idx):
        self.mode = OperationMode.training
        audio, audio_len = batch
        z, logdet, predicted_audio = self(audio=audio, audio_len=audio_len)
        loss = self.loss(z=z, logdet=logdet, gt_audio=audio, predicted_audio=predicted_audio, sigma=self.sigma)
        output = {
            'loss': loss,
            'progress_bar': {'training_loss': loss},
            'log': {'loss': loss},
        }
        return output

    def validation_step(self, batch, batch_idx):
        self.mode = OperationMode.validation
        audio, audio_len = batch
        z, logdet, predicted_audio, spec, spec_len = self(audio=audio, audio_len=audio_len)
        loss = self.loss(z=z, logdet=logdet, gt_audio=audio, predicted_audio=predicted_audio, sigma=self.sigma)

        # compute average stoi score for batch
        stoi_score = 0
        sr = self._cfg.preprocessor.sample_rate
        for audio_i, audio_recon_i in zip(audio.cpu(), predicted_audio.cpu()):
            stoi_score += stoi(audio_i, audio_recon_i, sr)
        stoi_score /= audio.shape[0]

        return {
            "val_loss": loss,
            "predicted_audio": predicted_audio,
            "mel_target": spec,
            "mel_len": spec_len,
            "stoi": stoi_score,
        }

    def validation_epoch_end(self, outputs):
        if self.logger is not None and self.logger.experiment is not None:
            tb_logger = self.logger.experiment
            if isinstance(self.logger, LoggerCollection):
                for logger in self.logger:
                    if isinstance(logger, TensorBoardLogger):
                        tb_logger = logger.experiment
                        break
            waveglow_log_to_tb_func(
                tb_logger,
                tuple(outputs[0].values())[:-1],
                self.global_step,
                tag="eval",
                mel_fb=self.audio_to_melspec_precessor.fb,
            )
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_stoi = torch.FloatTensor([x['stoi'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss, 'stoi': avg_stoi}
        logging.info(f"Validation summary | Epoch {self.current_epoch} | NLL {avg_loss:.2f} | STOI: {avg_stoi:.2f}")
        return {'val_loss': avg_loss, 'log': tensorboard_logs}

    def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, name: str = "train"):
        if "dataset" not in cfg or not isinstance(cfg.dataset, DictConfig):
            raise ValueError(f"No dataset for {name}")
        if "dataloader_params" not in cfg or not isinstance(cfg.dataloader_params, DictConfig):
            raise ValueError(f"No dataloader_params for {name}")
        if shuffle_should_be:
            if 'shuffle' not in cfg.dataloader_params:
                logging.warning(
                    f"Shuffle should be set to True for {self}'s {name} dataloader but was not found in its "
                    "config. Manually setting to True"
                )
                with open_dict(cfg["dataloader_params"]):
                    cfg.dataloader_params.shuffle = True
            elif not cfg.dataloader_params.shuffle:
                logging.error(f"The {name} dataloader for {self} has shuffle set to False!!!")
        elif not shuffle_should_be and cfg.dataloader_params.shuffle:
            logging.error(f"The {name} dataloader for {self} has shuffle set to True!!!")

        dataset = instantiate(cfg.dataset)
        return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params)

    def setup_training_data(self, cfg):
        self._train_dl = self.__setup_dataloader_from_config(cfg)

    def setup_validation_data(self, cfg):
        self._validation_dl = self.__setup_dataloader_from_config(cfg, shuffle_should_be=False, name="validation")

    @classmethod
    def list_available_models(cls) -> 'List[PretrainedModelInfo]':
        """
        This method returns a list of pre-trained models that can be instantiated directly from NVIDIA's NGC cloud.
        Returns:
            List of available pre-trained models.
        """
        list_of_models = []
        model = PretrainedModelInfo(
            pretrained_model_name="UniGlow-22050Hz",
            location="https://drive.google.com/file/d/18JO5heoz1pBicZnGGqJzAJYMpzxiDQDa/view?usp=sharing",
            description="The model is trained on LJSpeech sampled at 22050Hz, and can be used as a universal vocoder",
            class_=cls,
        )
        list_of_models.append(model)
        return list_of_models

    def get_upsample_factor(self) -> int:
        """
        As the MelSpectrogram upsampling is done using interpolation, the upsampling factor is determined by the
        ratio of the MelSpectrogram length and the waveform length.
        Returns:
            An integer representing the upsampling factor
        """
        audio = torch.ones(1, self._cfg.train_ds.dataset.n_segments)
        spec, spec_len = self.audio_to_melspec_precessor(audio, torch.FloatTensor([len(audio)]))
        spec = spec[:, :, :-1]
        audio = audio.unfold(1, self._cfg.uniglow.n_group, self._cfg.uniglow.n_group).permute(0, 2, 1)
        upsample_factor = audio.shape[2] // spec.shape[2]
        return upsample_factor
    def input_types(self):
        return {
            "decoder_input": NeuralType(('B', 'T', 'D'), EncodedRepresentation()),
            "lengths": NeuralType(('B'), LengthsType()),
        }
class Tacotron2Model(SpectrogramGenerator):
    """Tacotron 2 Model that is used to generate mel spectrograms from text"""

    def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
        # Convert to Hydra 1.0 compatible DictConfig
        cfg = model_utils.convert_model_config_to_dict_config(cfg)
        cfg = model_utils.maybe_update_config_version(cfg)

        # setup normalizer
        self.normalizer = None
        self.text_normalizer_call = None
        self.text_normalizer_call_kwargs = {}
        self._setup_normalizer(cfg)

        # setup tokenizer
        self.tokenizer = None
        if hasattr(cfg, 'text_tokenizer'):
            self._setup_tokenizer(cfg)
            self.num_tokens = len(self.tokenizer.tokens)
            self.tokenizer_pad = self.tokenizer.pad
            self.tokenizer_unk = self.tokenizer.oov
            # assert self.tokenizer is not None
        else:
            self.num_tokens = len(cfg.labels) + 3

        super().__init__(cfg=cfg, trainer=trainer)

        schema = OmegaConf.structured(Tacotron2Config)
        # ModelPT ensures that cfg is a DictConfig, but do this second check in case ModelPT changes
        if isinstance(cfg, dict):
            cfg = OmegaConf.create(cfg)
        elif not isinstance(cfg, DictConfig):
            raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
        # Ensure passed cfg is compliant with schema
        try:
            OmegaConf.merge(cfg, schema)
            self.pad_value = cfg.preprocessor.pad_value
        except ConfigAttributeError:
            self.pad_value = cfg.preprocessor.params.pad_value
            logging.warning(
                "Your config is using an old NeMo yaml configuration. Please ensure that the yaml matches the "
                "current version in the main branch for future compatibility."
            )

        self._parser = None
        self.audio_to_melspec_precessor = instantiate(cfg.preprocessor)
        self.text_embedding = nn.Embedding(self.num_tokens, 512)
        self.encoder = instantiate(self._cfg.encoder)
        self.decoder = instantiate(self._cfg.decoder)
        self.postnet = instantiate(self._cfg.postnet)
        self.loss = Tacotron2Loss()
        self.calculate_loss = True

    @property
    def parser(self):
        if self._parser is not None:
            return self._parser

        ds_class_name = self._cfg.train_ds.dataset._target_.split(".")[-1]
        if ds_class_name == "TTSDataset":
            self._parser = None
        elif hasattr(self._cfg, "labels"):
            self._parser = parsers.make_parser(
                labels=self._cfg.labels,
                name='en',
                unk_id=-1,
                blank_id=-1,
                do_normalize=True,
                abbreviation_version="fastpitch",
                make_table=False,
            )
        elif ds_class_name == "AudioToCharWithPriorAndPitchDataset":
            self._parser = self.vocab.encode
        else:
            raise ValueError("Wanted to setup parser, but model does not have necessary parameters")
        return self._parser

    def parse(self, text: str, normalize=True) -> torch.Tensor:
        if self.training:
            logging.warning("parse() is meant to be called in eval mode.")
        if normalize and self.text_normalizer_call is not None:
            text = self.text_normalizer_call(text, **self.text_normalizer_call_kwargs)

        eval_phon_mode = contextlib.nullcontext()
        if hasattr(self.tokenizer, "set_phone_prob"):
            eval_phon_mode = self.tokenizer.set_phone_prob(prob=1.0)

        with eval_phon_mode:
            if self.tokenizer is not None:
                tokens = self.tokenizer.encode(text)
            else:
                tokens = self.parser(text)
                # Old parser doesn't add bos and eos ids, so manually add them
                tokens = [len(self._cfg.labels)] + tokens + [len(self._cfg.labels) + 1]
        tokens_tensor = torch.tensor(tokens).unsqueeze_(0).to(self.device)

        return tokens_tensor

    @property
    def input_types(self):
        if self.training:
            return {
                "tokens": NeuralType(('B', 'T'), EmbeddedTextType()),
                "token_len": NeuralType(('B'), LengthsType()),
                "audio": NeuralType(('B', 'T'), AudioSignal()),
                "audio_len": NeuralType(('B'), LengthsType()),
            }
        else:
            return {
                "tokens": NeuralType(('B', 'T'), EmbeddedTextType()),
                "token_len": NeuralType(('B'), LengthsType()),
                "audio": NeuralType(('B', 'T'), AudioSignal(), optional=True),
                "audio_len": NeuralType(('B'), LengthsType(), optional=True),
            }

    @property
    def output_types(self):
        if not self.calculate_loss and not self.training:
            return {
                "spec_pred_dec": NeuralType(('B', 'D', 'T'), MelSpectrogramType()),
                "spec_pred_postnet": NeuralType(('B', 'D', 'T'), MelSpectrogramType()),
                "gate_pred": NeuralType(('B', 'T'), LogitsType()),
                "alignments": NeuralType(('B', 'T', 'T'), SequenceToSequenceAlignmentType()),
                "pred_length": NeuralType(('B'), LengthsType()),
            }
        return {
            "spec_pred_dec": NeuralType(('B', 'D', 'T'), MelSpectrogramType()),
            "spec_pred_postnet": NeuralType(('B', 'D', 'T'), MelSpectrogramType()),
            "gate_pred": NeuralType(('B', 'T'), LogitsType()),
            "spec_target": NeuralType(('B', 'D', 'T'), MelSpectrogramType()),
            "spec_target_len": NeuralType(('B'), LengthsType()),
            "alignments": NeuralType(('B', 'T', 'T'), SequenceToSequenceAlignmentType()),
        }

    @typecheck()
    def forward(self, *, tokens, token_len, audio=None, audio_len=None):
        if audio is not None and audio_len is not None:
            spec_target, spec_target_len = self.audio_to_melspec_precessor(audio, audio_len)

        token_embedding = self.text_embedding(tokens).transpose(1, 2)
        encoder_embedding = self.encoder(token_embedding=token_embedding, token_len=token_len)
        if self.training:
            spec_pred_dec, gate_pred, alignments = self.decoder(
                memory=encoder_embedding, decoder_inputs=spec_target, memory_lengths=token_len
            )
        else:
            spec_pred_dec, gate_pred, alignments, pred_length = self.decoder(
                memory=encoder_embedding, memory_lengths=token_len
            )
        spec_pred_postnet = self.postnet(mel_spec=spec_pred_dec)

        if not self.calculate_loss:
            return spec_pred_dec, spec_pred_postnet, gate_pred, alignments, pred_length
        return spec_pred_dec, spec_pred_postnet, gate_pred, spec_target, spec_target_len, alignments

    @typecheck(
        input_types={"tokens": NeuralType(('B', 'T'), EmbeddedTextType())},
        output_types={"spec": NeuralType(('B', 'D', 'T'), MelSpectrogramType())},
    )
    def generate_spectrogram(self, *, tokens):
        self.eval()
        self.calculate_loss = False
        token_len = torch.tensor([len(i) for i in tokens]).to(self.device)
        tensors = self(tokens=tokens, token_len=token_len)
        spectrogram_pred = tensors[1]

        if spectrogram_pred.shape[0] > 1:
            # Silence all frames past the predicted end
            mask = ~get_mask_from_lengths(tensors[-1])
            mask = mask.expand(spectrogram_pred.shape[1], mask.size(0), mask.size(1))
            mask = mask.permute(1, 0, 2)
            spectrogram_pred.data.masked_fill_(mask, self.pad_value)

        return spectrogram_pred

    def training_step(self, batch, batch_idx):
        audio, audio_len, tokens, token_len = batch
        spec_pred_dec, spec_pred_postnet, gate_pred, spec_target, spec_target_len, _ = self.forward(
            audio=audio, audio_len=audio_len, tokens=tokens, token_len=token_len
        )

        loss, _ = self.loss(
            spec_pred_dec=spec_pred_dec,
            spec_pred_postnet=spec_pred_postnet,
            gate_pred=gate_pred,
            spec_target=spec_target,
            spec_target_len=spec_target_len,
            pad_value=self.pad_value,
        )
        output = {
            'loss': loss,
            'progress_bar': {'training_loss': loss},
            'log': {'loss': loss},
        }
        return output

    def validation_step(self, batch, batch_idx):
        audio, audio_len, tokens, token_len = batch
        spec_pred_dec, spec_pred_postnet, gate_pred, spec_target, spec_target_len, alignments = self.forward(
            audio=audio, audio_len=audio_len, tokens=tokens, token_len=token_len
        )

        loss, gate_target = self.loss(
            spec_pred_dec=spec_pred_dec,
            spec_pred_postnet=spec_pred_postnet,
            gate_pred=gate_pred,
            spec_target=spec_target,
            spec_target_len=spec_target_len,
            pad_value=self.pad_value,
        )
        return {
            "val_loss": loss,
            "mel_target": spec_target,
            "mel_postnet": spec_pred_postnet,
            "gate": gate_pred,
            "gate_target": gate_target,
            "alignments": alignments,
        }

    def validation_epoch_end(self, outputs):
        if self.logger is not None and self.logger.experiment is not None:
            logger = self.logger.experiment
            if isinstance(self.logger, LoggerCollection):
                for logger in self.logger:
                    if isinstance(logger, TensorBoardLogger):
                        logger = logger.experiment
                        break
            if isinstance(logger, TensorBoardLogger):
                tacotron2_log_to_tb_func(
                    logger, outputs[0].values(), self.global_step, tag="val", log_images=True, add_audio=False
                )
            elif isinstance(logger, WandbLogger):
                tacotron2_log_to_wandb_func(
                    logger, outputs[0].values(), self.global_step, tag="val", log_images=True, add_audio=False
                )
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()  # This reduces across batches, not workers!
        self.log('val_loss', avg_loss)

    def _setup_normalizer(self, cfg):
        if "text_normalizer" in cfg:
            normalizer_kwargs = {}

            if "whitelist" in cfg.text_normalizer:
                normalizer_kwargs["whitelist"] = self.register_artifact(
                    'text_normalizer.whitelist', cfg.text_normalizer.whitelist
                )

            self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs)
            self.text_normalizer_call = self.normalizer.normalize
            if "text_normalizer_call_kwargs" in cfg:
                self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs

    def _setup_tokenizer(self, cfg):
        text_tokenizer_kwargs = {}
        if "g2p" in cfg.text_tokenizer and cfg.text_tokenizer.g2p is not None:
            g2p_kwargs = {}

            if "phoneme_dict" in cfg.text_tokenizer.g2p:
                g2p_kwargs["phoneme_dict"] = self.register_artifact(
                    'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict,
                )

            if "heteronyms" in cfg.text_tokenizer.g2p:
                g2p_kwargs["heteronyms"] = self.register_artifact(
                    'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms,
                )

            text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs)

        self.tokenizer = instantiate(cfg.text_tokenizer, **text_tokenizer_kwargs)

    def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, name: str = "train"):
        if "dataset" not in cfg or not isinstance(cfg.dataset, DictConfig):
            raise ValueError(f"No dataset for {name}")
        if "dataloader_params" not in cfg or not isinstance(cfg.dataloader_params, DictConfig):
            raise ValueError(f"No dataloader_params for {name}")
        if shuffle_should_be:
            if 'shuffle' not in cfg.dataloader_params:
                logging.warning(
                    f"Shuffle should be set to True for {self}'s {name} dataloader but was not found in its "
                    "config. Manually setting to True"
                )
                with open_dict(cfg.dataloader_params):
                    cfg.dataloader_params.shuffle = True
            elif not cfg.dataloader_params.shuffle:
                logging.error(f"The {name} dataloader for {self} has shuffle set to False!!!")
        elif not shuffle_should_be and cfg.dataloader_params.shuffle:
            logging.error(f"The {name} dataloader for {self} has shuffle set to True!!!")

        dataset = instantiate(
            cfg.dataset,
            text_normalizer=self.normalizer,
            text_normalizer_call_kwargs=self.text_normalizer_call_kwargs,
            text_tokenizer=self.tokenizer,
        )
        return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params)

    def setup_training_data(self, cfg):
        self._train_dl = self.__setup_dataloader_from_config(cfg)

    def setup_validation_data(self, cfg):
        self._validation_dl = self.__setup_dataloader_from_config(cfg, shuffle_should_be=False, name="validation")

    @classmethod
    def list_available_models(cls) -> 'List[PretrainedModelInfo]':
        """
        This method returns a list of pre-trained models that can be instantiated directly from NVIDIA's NGC cloud.
        Returns:
            List of available pre-trained models.
        """
        list_of_models = []
        model = PretrainedModelInfo(
            pretrained_model_name="tts_en_tacotron2",
            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_tacotron2/versions/1.0.0/files/tts_en_tacotron2.nemo",
            description="This model is trained on LJSpeech sampled at 22050Hz, and can be used to generate female "
            "English voices with an American accent.",
            class_=cls,
            aliases=["Tacotron2-22050Hz"],
        )
        list_of_models.append(model)
        return list_of_models
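# Usage sketch (illustrative only). Text to mel spectrogram with the model above; the
# checkpoint name comes from list_available_models(). Pairing with a vocoder is an assumption.
def _example_tacotron2_inference():
    from nemo.collections.tts.models import Tacotron2Model

    model = Tacotron2Model.from_pretrained(model_name="tts_en_tacotron2")
    model.eval()
    tokens = model.parse("Hello world")               # (1, T) token ids
    spec = model.generate_spectrogram(tokens=tokens)  # (1, n_mels, T_frames)
    # spec can then be fed to any vocoder in this file, e.g.
    # audio = vocoder.convert_spectrogram_to_audio(spec=spec)
    return spec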
    def input_types(self):
        # phonemes
        return {
            "text": NeuralType(('B', 'T'), TokenIndex()),
            "text_length": NeuralType(('B'), LengthsType()),
        }
class FastPitchHifiGanE2EModel(TextToWaveform):
    """An end-to-end speech synthesis model based on FastPitch and HiFiGan that converts strings to audio without
    using the intermediate mel spectrogram representation."""

    def __init__(self, cfg: DictConfig, trainer: Trainer = None):
        if isinstance(cfg, dict):
            cfg = OmegaConf.create(cfg)

        self._parser = parsers.make_parser(
            labels=cfg.labels,
            name='en',
            unk_id=-1,
            blank_id=-1,
            do_normalize=True,
            abbreviation_version="fastpitch",
            make_table=False,
        )

        super().__init__(cfg=cfg, trainer=trainer)

        schema = OmegaConf.structured(FastPitchHifiGanE2EConfig)
        # ModelPT ensures that cfg is a DictConfig, but do this second check in case ModelPT changes
        if isinstance(cfg, dict):
            cfg = OmegaConf.create(cfg)
        elif not isinstance(cfg, DictConfig):
            raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
        # Ensure passed cfg is compliant with schema
        OmegaConf.merge(cfg, schema)

        self.preprocessor = instantiate(cfg.preprocessor)
        self.melspec_fn = instantiate(cfg.preprocessor, highfreq=None, use_grads=True)

        self.encoder = instantiate(cfg.input_fft)
        self.duration_predictor = instantiate(cfg.duration_predictor)
        self.pitch_predictor = instantiate(cfg.pitch_predictor)

        self.generator = instantiate(cfg.generator)
        self.multiperioddisc = MultiPeriodDiscriminator()
        self.multiscaledisc = MultiScaleDiscriminator()

        self.mel_val_loss = L1MelLoss()
        self.feat_matching_loss = FeatureMatchingLoss()
        self.disc_loss = DiscriminatorLoss()
        self.gen_loss = GeneratorLoss()

        self.max_token_duration = cfg.max_token_duration

        self.pitch_emb = torch.nn.Conv1d(
            1,
            cfg.symbols_embedding_dim,
            kernel_size=cfg.pitch_embedding_kernel_size,
            padding=int((cfg.pitch_embedding_kernel_size - 1) / 2),
        )

        # Store values precomputed from training data for convenience
        self.register_buffer('pitch_mean', torch.zeros(1))
        self.register_buffer('pitch_std', torch.zeros(1))

        self.loss = BaseFastPitchLoss()

        self.mel_loss_coeff = cfg.mel_loss_coeff

        self.log_train_images = False
        self.logged_real_samples = False
        self._tb_logger = None
        self.hann_window = None
        self.splice_length = cfg.splice_length
        self.sample_rate = cfg.sample_rate
        self.hop_size = cfg.hop_size

    @property
    def tb_logger(self):
        if self._tb_logger is None:
            if self.logger is None or self.logger.experiment is None:
                return None
            tb_logger = self.logger.experiment
            if isinstance(self.logger, LoggerCollection):
                for logger in self.logger:
                    if isinstance(logger, TensorBoardLogger):
                        tb_logger = logger.experiment
                        break
            self._tb_logger = tb_logger
        return self._tb_logger

    @property
    def parser(self):
        if self._parser is not None:
            return self._parser
        self._parser = parsers.make_parser(
            labels=self._cfg.labels,
            name='en',
            unk_id=-1,
            blank_id=-1,
            do_normalize=True,
            abbreviation_version="fastpitch",
            make_table=False,
        )
        return self._parser

    def parse(self, str_input: str) -> torch.tensor:
        if str_input[-1] not in [".", "!", "?"]:
            str_input = str_input + "."
        tokens = self.parser(str_input)
        x = torch.tensor(tokens).unsqueeze_(0).long().to(self.device)
        return x

    def configure_optimizers(self):
        gen_params = chain(
            self.pitch_emb.parameters(),
            self.encoder.parameters(),
            self.duration_predictor.parameters(),
            self.pitch_predictor.parameters(),
            self.generator.parameters(),
        )
        disc_params = chain(self.multiscaledisc.parameters(), self.multiperioddisc.parameters())
        opt1 = torch.optim.AdamW(disc_params, lr=self._cfg.lr)
        opt2 = torch.optim.AdamW(gen_params, lr=self._cfg.lr)

        num_procs = self._trainer.num_gpus * self._trainer.num_nodes
        num_samples = len(self._train_dl.dataset)
        batch_size = self._train_dl.batch_size
        iter_per_epoch = np.ceil(num_samples / (num_procs * batch_size))
        max_steps = iter_per_epoch * self._trainer.max_epochs
        logging.info(f"MAX STEPS: {max_steps}")

        sch1 = NoamAnnealing(opt1, d_model=1, warmup_steps=1000, max_steps=max_steps, last_epoch=-1)
        sch1_dict = {
            'scheduler': sch1,
            'interval': 'step',
        }
        sch2 = NoamAnnealing(opt2, d_model=1, warmup_steps=1000, max_steps=max_steps, last_epoch=-1)
        sch2_dict = {
            'scheduler': sch2,
            'interval': 'step',
        }
        return [opt1, opt2], [sch1_dict, sch2_dict]

    @typecheck(
        input_types={
            "text": NeuralType(('B', 'T'), TokenIndex()),
            "durs": NeuralType(('B', 'T'), TokenDurationType(), optional=True),
            "pitch": NeuralType(('B', 'T'), RegressionValuesType(), optional=True),
            "pace": NeuralType(optional=True),
            "splice": NeuralType(optional=True),
        },
        output_types={
            "audio": NeuralType(('B', 'T'), MelSpectrogramType()),
            "splices": NeuralType(),
            "log_dur_preds": NeuralType(('B', 'T'), TokenLogDurationType()),
            "pitch_preds": NeuralType(('B', 'T'), RegressionValuesType()),
        },
    )
    def forward(self, *, text, durs=None, pitch=None, pace=1.0, splice=True):
        if self.training:
            assert durs is not None
            assert pitch is not None

        # Input FFT
        enc_out, enc_mask = self.encoder(input=text, conditioning=0)

        # Embedded for predictors
        pred_enc_out, pred_enc_mask = enc_out, enc_mask

        # Predict durations
        log_durs_predicted = self.duration_predictor(pred_enc_out, pred_enc_mask)
        durs_predicted = torch.clamp(torch.exp(log_durs_predicted) - 1, 0, self.max_token_duration)

        # Predict pitch
        pitch_predicted = self.pitch_predictor(enc_out, enc_mask)
        if pitch is None:
            pitch_emb = self.pitch_emb(pitch_predicted.unsqueeze(1))
        else:
            pitch_emb = self.pitch_emb(pitch.unsqueeze(1))
        enc_out = enc_out + pitch_emb.transpose(1, 2)

        if durs is None:
            len_regulated, dec_lens = regulate_len(durs_predicted, enc_out, pace)
        else:
            len_regulated, dec_lens = regulate_len(durs, enc_out, pace)

        gen_in = len_regulated
        splices = None
        if splice:
            # Splice generated spec
            output = []
            splices = []
            for i, sample in enumerate(len_regulated):
                start = np.random.randint(low=0, high=min(int(sample.size(0)), int(dec_lens[i])) - self.splice_length)
                output.append(sample[start : start + self.splice_length, :])
                splices.append(start)
            gen_in = torch.stack(output)

        output = self.generator(gen_in.transpose(1, 2))

        return output, splices, log_durs_predicted, pitch_predicted

    def training_step(self, batch, batch_idx, optimizer_idx):
        audio, _, text, text_lens, durs, pitch, _ = batch

        # train discriminator
        if optimizer_idx == 0:
            with torch.no_grad():
                audio_pred, splices, _, _ = self(text=text, durs=durs, pitch=pitch)
                real_audio = []
                for i, splice in enumerate(splices):
                    real_audio.append(audio[i, splice * self.hop_size : (splice + self.splice_length) * self.hop_size])
                real_audio = torch.stack(real_audio).unsqueeze(1)

            real_score_mp, gen_score_mp, _, _ = self.multiperioddisc(real_audio, audio_pred)
            real_score_ms, gen_score_ms, _, _ = self.multiscaledisc(real_audio, audio_pred)

            loss_mp, loss_mp_real, _ = self.disc_loss(real_score_mp, gen_score_mp)
            loss_ms, loss_ms_real, _ = self.disc_loss(real_score_ms, gen_score_ms)
            loss_mp /= len(loss_mp_real)
            loss_ms /= len(loss_ms_real)
            loss_disc = loss_mp + loss_ms

            self.log("loss_discriminator", loss_disc, prog_bar=True)
            self.log("loss_discriminator_ms", loss_ms)
            self.log("loss_discriminator_mp", loss_mp)
            return loss_disc

        # train generator
        elif optimizer_idx == 1:
            audio_pred, splices, log_dur_preds, pitch_preds = self(text=text, durs=durs, pitch=pitch)
            real_audio = []
            for i, splice in enumerate(splices):
                real_audio.append(audio[i, splice * self.hop_size : (splice + self.splice_length) * self.hop_size])
            real_audio = torch.stack(real_audio).unsqueeze(1)

            _, dur_loss, pitch_loss = self.loss(
                log_durs_predicted=log_dur_preds,
                pitch_predicted=pitch_preds,
                durs_tgt=durs,
                dur_lens=text_lens,
                pitch_tgt=pitch,
            )

            # Do HiFiGAN generator loss
            audio_length = torch.tensor([self.splice_length * self.hop_size for _ in range(real_audio.shape[0])]).to(
                real_audio.device
            )
            real_spliced_spec, _ = self.melspec_fn(real_audio.squeeze(), audio_length)
            pred_spliced_spec, _ = self.melspec_fn(audio_pred.squeeze(), audio_length)
            loss_mel = torch.nn.functional.l1_loss(real_spliced_spec, pred_spliced_spec)
            loss_mel *= self.mel_loss_coeff
            _, gen_score_mp, _, _ = self.multiperioddisc(real_audio, audio_pred)
            _, gen_score_ms, _, _ = self.multiscaledisc(real_audio, audio_pred)
            loss_gen_mp, list_loss_gen_mp = self.gen_loss(gen_score_mp)
            loss_gen_ms, list_loss_gen_ms = self.gen_loss(gen_score_ms)
            loss_gen_mp /= len(list_loss_gen_mp)
            loss_gen_ms /= len(list_loss_gen_ms)
            total_loss = loss_gen_mp + loss_gen_ms + loss_mel
            total_loss += dur_loss
            total_loss += pitch_loss

            self.log(name="loss_gen_mel", value=loss_mel)
            self.log(name="loss_gen_disc", value=loss_gen_mp + loss_gen_ms)
            self.log(name="loss_gen_disc_mp", value=loss_gen_mp)
            self.log(name="loss_gen_disc_ms", value=loss_gen_ms)
            self.log(name="loss_gen_duration", value=dur_loss)
            self.log(name="loss_gen_pitch", value=pitch_loss)

            # Log images to tensorboard
            if self.log_train_images:
                self.log_train_images = False
                if self.logger is not None and self.logger.experiment is not None:
                    self.tb_logger.add_image(
                        "train_mel_target",
                        plot_spectrogram_to_numpy(real_spliced_spec[0].data.cpu().numpy()),
                        self.global_step,
                        dataformats="HWC",
                    )
                    spec_predict = pred_spliced_spec[0].data.cpu().numpy()
                    self.tb_logger.add_image(
                        "train_mel_predicted",
                        plot_spectrogram_to_numpy(spec_predict),
                        self.global_step,
                        dataformats="HWC",
                    )

            self.log(name="loss_gen", prog_bar=True, value=total_loss)
            return total_loss

    def validation_step(self, batch, batch_idx):
        audio, audio_lens, text, _, _, _, _ = batch
        mels, mel_lens = self.preprocessor(audio, audio_lens)
        audio_pred, _, log_durs_predicted, _ = self(text=text, durs=None, pitch=None, splice=False)
        audio_length = torch.sum(torch.clamp(torch.exp(log_durs_predicted) - 1, 0), axis=1)
        audio_pred.squeeze_()
        pred_spec, _ = self.melspec_fn(audio_pred, audio_length)
        loss = self.mel_val_loss(
            spec_pred=pred_spec, spec_target=mels, spec_target_len=mel_lens, pad_value=-11.52, transpose=False
        )

        return {
            "val_loss": loss,
            "audio_target": audio if batch_idx == 0 else None,
            "audio_pred": audio_pred.squeeze() if batch_idx == 0 else None,
        }

    def validation_epoch_end(self, outputs):
        if self.tb_logger is not None:
            _, audio_target, audio_predict = outputs[0].values()
            if not self.logged_real_samples:
                self.tb_logger.add_audio("val_target", audio_target[0].data.cpu(), self.global_step, self.sample_rate)
                self.logged_real_samples = True
            audio_predict = audio_predict[0].data.cpu()
            self.tb_logger.add_audio("val_pred", audio_predict, self.global_step, self.sample_rate)
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()  # This reduces across batches, not workers!
        self.log('val_loss', avg_loss, sync_dist=True)
        self.log_train_images = True

    def _loader(self, cfg):
        dataset = FastPitchDataset(
            manifest_filepath=cfg['manifest_filepath'],
            parser=self.parser,
            sample_rate=cfg['sample_rate'],
            int_values=cfg.get('int_values', False),
            max_duration=cfg.get('max_duration', None),
            min_duration=cfg.get('min_duration', None),
            max_utts=cfg.get('max_utts', 0),
            trim=cfg.get('trim_silence', True),
        )
        return torch.utils.data.DataLoader(
            dataset=dataset,
            batch_size=cfg['batch_size'],
            collate_fn=dataset.collate_fn,
            drop_last=cfg.get('drop_last', True),
            shuffle=cfg['shuffle'],
            num_workers=cfg.get('num_workers', 16),
        )

    def setup_training_data(self, cfg):
        self._train_dl = self._loader(cfg)

    def setup_validation_data(self, cfg):
        self._validation_dl = self._loader(cfg)

    def setup_test_data(self, cfg):
        """Omitted."""
        pass

    @classmethod
    def list_available_models(cls) -> 'List[PretrainedModelInfo]':
        """
        This method returns a list of pre-trained models that can be instantiated directly from NVIDIA's NGC cloud.
        Returns:
            List of available pre-trained models.
        """
        list_of_models = []
        # model = PretrainedModelInfo(
        #     pretrained_model_name="",
        #     location="",
        #     description="",
        #     class_=cls,
        # )
        # list_of_models.append(model)
        return list_of_models

    def convert_text_to_waveform(self, *, tokens):
        """
        Accepts tokens returned from self.parse() and returns a list of tensors.

        Note: The tensors in the list can have different lengths.
        """
        self.eval()
        audio, _, log_dur_pred, _ = self(text=tokens, splice=False)
        audio = audio.squeeze()
        durations = torch.sum(torch.clamp(torch.exp(log_dur_pred) - 1, 0, self.max_token_duration), 1).to(torch.int)
        audio_list = []
        for i, sample in enumerate(audio):
            audio_list.append(sample[: durations[i] * self.hop_size])

        return audio_list
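# Usage sketch (illustrative only). The model above ships no pretrained checkpoint
# (list_available_models() returns an empty list), so this assumes a locally trained
# .nemo file; the path below is a placeholder, not a real artifact.
def _example_fastpitch_e2e_tts(checkpoint="fastpitch_hifigan_e2e.nemo"):
    from nemo.collections.tts.models import FastPitchHifiGanE2EModel

    model = FastPitchHifiGanE2EModel.restore_from(checkpoint)
    model.eval()
    tokens = model.parse("Hello world.")
    # convert_text_to_waveform() trims each clip to predicted_duration * hop_size samples
    audio_list = model.convert_text_to_waveform(tokens=tokens)
    return audio_list[0]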
    def input_types(self):
        return {
            "x_mag": NeuralType(('B', 'T', 'D'), SpectrogramType()),
            "y_mag": NeuralType(('B', 'T', 'D'), SpectrogramType()),
            "input_lengths": NeuralType(('B'), LengthsType(), optional=True),
        }

    def input_types(self):
        return {
            "y": NeuralType(('B', 'S', 'T'), AudioSignal()),
            "y_hat": NeuralType(('B', 'S', 'T'), AudioSignal()),
        }
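# Sketch (illustrative only): how input_types/output_types properties like the ones above
# are consumed. NeMo's @typecheck() reads them to validate keyword arguments at call time.
# The module below is a toy assumption, not a class from this file; NeuralModule is assumed
# to come from nemo.core.classes.
from nemo.core.classes import NeuralModule  # assumed import for this sketch


class _ExampleTypedModule(NeuralModule):
    @property
    def input_types(self):
        return {"y": NeuralType(('B', 'S', 'T'), AudioSignal())}

    @property
    def output_types(self):
        return {"score": NeuralType(elements_type=VoidType())}

    @typecheck()
    def forward(self, *, y):
        # By this point @typecheck() has verified y's axes and element type against input_types
        return y.mean()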