def _load_tacotron2_model(self, model_dir):
    """Load the Tacotron 2 checkpoint from ``model_dir`` onto ``self.device``.

    Reads ``nvidia_tacotron2pyt_fp32_20190427.pth`` from ``model_dir``, builds
    a ``Tacotron2`` network from the ``config`` entry stored in the
    checkpoint, restores the weights from its ``state_dict`` entry, and
    stores the result as ``self.tacotron2_model``.

    Args:
        model_dir: Directory that contains the checkpoint file.

    Side effects:
        Sets ``self.tacotron2_model``. The model also gets a
        ``text_to_sequence`` attribute pointing at the Tacotron 2 text
        front-end function.
    """
    from PyTorch.SpeechSynthesis.Tacotron2.tacotron2 import model as tacotron2
    from PyTorch.SpeechSynthesis.Tacotron2.tacotron2.text import text_to_sequence

    checkpoint_path = os.path.join(model_dir, 'nvidia_tacotron2pyt_fp32_20190427.pth')
    # Deserialize onto the CPU: without map_location, torch.load tries to
    # restore tensors to the device they were saved from, which fails on a
    # host that lacks that device. Placement is handled by .to() below.
    tacotron2_checkpoint = torch.load(checkpoint_path, map_location='cpu')
    tacotron2_state_dict = self._unwrap_distributed(tacotron2_checkpoint['state_dict'])
    tacotron2_config = tacotron2_checkpoint['config']

    self.tacotron2_model = tacotron2.Tacotron2(**tacotron2_config)
    self.tacotron2_model.load_state_dict(tacotron2_state_dict)
    # Attach the text front end so callers can reach it through the model.
    self.tacotron2_model.text_to_sequence = text_to_sequence
    self.tacotron2_model.to(self.device)
def load_tacotron2(weightpath='checkpoints/tacotron2_20200314.pth'):
    """Build a Tacotron 2 model from a checkpoint file.

    The checkpoint must contain a ``config`` entry (keyword arguments for
    ``tacotron2.Tacotron2``) and a ``state_dict`` entry (the weights). When
    ``checkpoint_from_distributed`` reports that the weights come from a
    distributed run, they are passed through ``unwrap_distributed`` first.

    Args:
        weightpath: Path to the checkpoint file.

    Returns:
        The constructed model with the checkpoint weights loaded and a
        ``text_to_sequence`` attribute attached.
    """
    # Deserialize onto the CPU: without map_location, torch.load tries to
    # restore tensors to the device they were saved from, which fails on a
    # host that lacks that device. load_state_dict copies the values into the
    # freshly built model, so the returned model is unaffected.
    ckpt = torch.load(weightpath, map_location='cpu')

    state_dict = ckpt['state_dict']
    if checkpoint_from_distributed(state_dict):
        state_dict = unwrap_distributed(state_dict)

    config = ckpt['config']
    tacotron2_model = tacotron2.Tacotron2(**config)
    tacotron2_model.load_state_dict(state_dict)
    # Attach the text front end so callers can reach it through the model.
    tacotron2_model.text_to_sequence = text_to_sequence
    return tacotron2_model
def load_tacotron2(fp16=fp16, device=device):
    """Build a Tacotron 2 model from the checkpoint at ``tacotron2_path``.

    The checkpoint's ``config`` entry supplies the constructor arguments
    (with ``device`` added under the ``'device'`` key) and its ``state_dict``
    entry supplies the weights. Weights that ``checkpoint_from_distributed``
    flags are passed through ``unwrap_distributed`` before loading.

    For detailed information on the model itself visit:
    github.com/NVIDIA/DeepLearningExamples and/or ngc.nvidia.com

    Args:
        fp16: If truthy, the model is converted with ``.half()`` and then
            passed through ``batchnorm_to_float`` and ``lstmcell_to_float``
            before the weights are loaded. Defaults to the module-level
            ``fp16`` value at definition time.
        device: Used as ``map_location`` when reading the checkpoint and
            forwarded to the model constructor. Defaults to the module-level
            ``device`` value at definition time.

    Returns:
        The model with weights loaded and ``text_to_sequence`` attached as an
        attribute.
    """
    from PyTorch.SpeechSynthesis.Tacotron2.tacotron2 import model as tacotron2
    from PyTorch.SpeechSynthesis.Tacotron2.models import batchnorm_to_float, lstmcell_to_float
    from PyTorch.SpeechSynthesis.Tacotron2.tacotron2.text import text_to_sequence

    checkpoint = torch.load(tacotron2_path, map_location=device)

    weights = checkpoint['state_dict']
    if checkpoint_from_distributed(weights):
        weights = unwrap_distributed(weights)

    model_config = checkpoint['config']
    model_config['device'] = device
    model = tacotron2.Tacotron2(**model_config)

    if fp16:
        # Half precision everywhere, then the two helpers are applied in the
        # same order as before: batch norm first, LSTM cells second.
        model = lstmcell_to_float(batchnorm_to_float(model.half()))

    model.load_state_dict(weights)
    model.text_to_sequence = text_to_sequence
    return model
def nvidia_tacotron2(pretrained=True, **kwargs):
    """Constructs a Tacotron 2 model (nn.module with additional infer(input) method).

    For detailed information on model input and output, training recipes,
    inference and performance visit: github.com/NVIDIA/DeepLearningExamples
    and/or ngc.nvidia.com

    When ``pretrained`` is true, the checkpoint is downloaded to
    ``tacotron2_ckpt.pt`` in the current working directory and the model
    configuration is taken from it. Otherwise a built-in default
    configuration is used and the weights are left at their initial values.
    In both cases, keyword arguments whose names match a configuration key
    override that key; all other keyword arguments are ignored.

    Args (type[, default value]):
        pretrained (bool, True): If True, returns a model pretrained on LJ Speech dataset.
        model_math (str, 'fp32'): returns a model in given precision ('fp32' or 'fp16')
        n_symbols (int, 148): Number of symbols used in a sequence passed to the prenet, see
            https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/tacotron2/text/symbols.py
        p_attention_dropout (float, 0.1): dropout probability on attention LSTM (1st LSTM layer in decoder)
        p_decoder_dropout (float, 0.1): dropout probability on decoder LSTM (2nd LSTM layer in decoder)
        max_decoder_steps (int, 1000): maximum number of generated mel spectrograms during inference

    Returns:
        The constructed Tacotron 2 model, with pretrained weights loaded when
        ``pretrained`` is true.
    """
    from PyTorch.SpeechSynthesis.Tacotron2.tacotron2 import model as tacotron2
    from PyTorch.SpeechSynthesis.Tacotron2.models import lstmcell_to_float, batchnorm_to_float

    fp16 = kwargs.get("model_math") == "fp16"

    if pretrained:
        if fp16:
            checkpoint = 'https://developer.nvidia.com/joc-tacotron2-fp16-pyt-20190306'
        else:
            checkpoint = 'https://developer.nvidia.com/joc-tacotron2-fp32-pyt-20190306'
        ckpt_file = "tacotron2_ckpt.pt"
        urllib.request.urlretrieve(checkpoint, ckpt_file)
        # Deserialize onto the CPU: without map_location, torch.load tries to
        # restore tensors to the device they were saved from, which fails on
        # a host that lacks that device. load_state_dict below copies the
        # values into the freshly built model, so the result is unaffected.
        ckpt = torch.load(ckpt_file, map_location='cpu')
        state_dict = ckpt['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)
        config = ckpt['config']
    else:
        config = {
            'mask_padding': False,
            'n_mel_channels': 80,
            'n_symbols': 148,
            'symbols_embedding_dim': 512,
            'encoder_kernel_size': 5,
            'encoder_n_convolutions': 3,
            'encoder_embedding_dim': 512,
            'attention_rnn_dim': 1024,
            'attention_dim': 128,
            'attention_location_n_filters': 32,
            'attention_location_kernel_size': 31,
            'n_frames_per_step': 1,
            'decoder_rnn_dim': 1024,
            'prenet_dim': 256,
            'max_decoder_steps': 1000,
            'gate_threshold': 0.5,
            'p_attention_dropout': 0.1,
            'p_decoder_dropout': 0.1,
            'postnet_embedding_dim': 512,
            'postnet_kernel_size': 5,
            'postnet_n_convolutions': 5,
            'decoder_no_early_stopping': False,
        }

    # Only keys already present in the configuration may be overridden, so
    # options such as model_math never reach the model constructor.
    for k, v in kwargs.items():
        if k in config:
            config[k] = v

    m = tacotron2.Tacotron2(**config)

    if fp16:
        m = batchnorm_to_float(m.half())
        m = lstmcell_to_float(m)

    if pretrained:
        m.load_state_dict(state_dict)

    return m