Beispiel #1
0
 def batchnorm_to_float(module):
     """Cast every batch-norm layer found in ``module`` to FP32, in place.

     ``module`` itself and all of its descendants are inspected; each one that
     is an instance of ``torch.nn.modules.batchnorm._BatchNorm`` has
     ``.float()`` called on it.

     Args:
         module: the ``torch.nn.Module`` to process.

     Returns:
         The same ``module`` object that was passed in.
     """
     batchnorm_base = torch.nn.modules.batchnorm._BatchNorm
     # ``modules()`` yields ``module`` first and then every nested submodule.
     for layer in module.modules():
         if isinstance(layer, batchnorm_base):
             layer.float()
     return module
Beispiel #2
0
def load_waveglow(fp16=fp16):
    """Build a WaveGlow model and load its weights from a local checkpoint.

    The checkpoint is read with ``torch.load`` from the module-level
    ``waveglow_path`` and mapped onto the module-level ``device``. It must
    contain a ``'state_dict'`` entry (the weights) and a ``'config'`` entry,
    which is unpacked as keyword arguments to ``WaveGlow``.

    Args:
        fp16 (bool): when truthy, the model is cast to half precision, after
            which batch-norm layers and every module in ``convinv`` are cast
            back to FP32. The default is the value of the module-level
            ``fp16`` at the time this function is defined.

    Returns:
        The WaveGlow model with the checkpoint weights loaded.
    """
    from PyTorch.SpeechSynthesis.Tacotron2.waveglow import model as waveglow
    from PyTorch.SpeechSynthesis.Tacotron2.models import batchnorm_to_float

    checkpoint = torch.load(waveglow_path, map_location=device)

    weights = checkpoint['state_dict']
    # Unwrap the weights when the helper reports a distributed checkpoint.
    if checkpoint_from_distributed(weights):
        weights = unwrap_distributed(weights)

    model = waveglow.WaveGlow(**checkpoint['config'])

    if fp16:
        # Half precision everywhere except batch norm and the convinv modules.
        model = batchnorm_to_float(model.half())
        for inv_conv in model.convinv:
            inv_conv.float()

    model.load_state_dict(weights)
    return model
def nvidia_waveglow(pretrained=True, **kwargs):
    """Constructs a WaveGlow model (nn.module with additional infer(input) method).
    For detailed information on model input and output, training recipes, inference and performance
    visit: github.com/NVIDIA/DeepLearningExamples and/or ngc.nvidia.com

    When ``pretrained`` is true the checkpoint is downloaded into the current
    working directory under the last path component of its URL, and reused on
    later calls. When ``pretrained`` is false the model is built from a
    default configuration; any keyword argument whose name matches a
    top-level configuration key or a ``WN_config`` key overrides that value.

    Args:
        pretrained (bool, True): If True, returns a model pretrained on LJ Speech dataset.
        model_math (str, 'fp32'): returns a model in given precision ('fp32' or 'fp16')
        force_reload (bool, False): If True, download the checkpoint again even
            when a previously downloaded copy exists.

    Returns:
        The constructed WaveGlow model.
    """
    import os
    import sys

    from PyTorch.SpeechSynthesis.Tacotron2.waveglow import model as waveglow
    from PyTorch.SpeechSynthesis.Tacotron2.models import batchnorm_to_float

    fp16 = kwargs.get("model_math") == "fp16"
    force_reload = bool(kwargs.get("force_reload", False))

    if pretrained:
        if fp16:
            checkpoint = 'https://developer.nvidia.com/joc-waveglow-fp16-pyt-20190306'
        else:
            checkpoint = 'https://developer.nvidia.com/joc-waveglow-fp32-pyt-20190306'
        # Name the local file after the URL so the fp16 and fp32 checkpoints
        # are cached separately and never mistaken for one another.
        ckpt_file = os.path.basename(checkpoint)
        if force_reload or not os.path.exists(ckpt_file):
            sys.stderr.write(
                'Downloading checkpoint from {}\n'.format(checkpoint))
            # Download under a temporary name and rename afterwards, so an
            # interrupted transfer is never treated as a cached checkpoint.
            partial_file = ckpt_file + '.part'
            urllib.request.urlretrieve(checkpoint, partial_file)
            os.replace(partial_file, ckpt_file)
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        ckpt = torch.load(ckpt_file)
        state_dict = ckpt['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)
        config = ckpt['config']
    else:
        config = {
            'n_mel_channels': 80,
            'n_flows': 12,
            'n_group': 8,
            'n_early_every': 4,
            'n_early_size': 2,
            'WN_config': {
                'n_layers': 8,
                'kernel_size': 3,
                'n_channels': 512,
            },
        }
        # Keyword arguments that match no configuration key are ignored.
        for k, v in kwargs.items():
            if k in config:
                config[k] = v
            elif k in config['WN_config']:
                config['WN_config'][k] = v

    m = waveglow.WaveGlow(**config)

    if fp16:
        # Half precision everywhere except batch norm and the convinv modules.
        m = batchnorm_to_float(m.half())
        for mat in m.convinv:
            mat.float()

    if pretrained:
        m.load_state_dict(state_dict)

    return m
Beispiel #4
0
def nvidia_ssd(pretrained=True, **kwargs):
    """Build an SSD300 model, optionally loading pretrained weights.
    For detailed information on model input and output, training recipes, inference and performance
    visit: github.com/NVIDIA/DeepLearningExamples and/or ngc.nvidia.com

    With ``pretrained`` set, the checkpoint is stored in the current working
    directory under the last path component of its URL. It is downloaded only
    when that file is missing or ``force_reload`` is truthy.

    Args:
        pretrained (bool, True): If True, returns a model pretrained on COCO dataset.
        model_math (str, 'fp32'): returns a model in given precision ('fp32' or 'fp16')
        force_reload (bool, False): download the checkpoint even if the local
            file already exists.

    Returns:
        The constructed SSD300 model.
    """

    from PyTorch.Detection.SSD.src import model as ssd

    use_fp16 = kwargs.get("model_math") == "fp16"
    reload_requested = kwargs.get("force_reload", False)

    model = ssd.SSD300()
    if use_fp16:
        model = model.half()
        # Cast batch-norm layers back to FP32 after the half() conversion.
        batchnorm_base = torch.nn.modules.batchnorm._BatchNorm
        for layer in model.modules():
            if isinstance(layer, batchnorm_base):
                layer.float()

    if not pretrained:
        return model

    checkpoint = (
        'https://developer.nvidia.com/joc-ssd-fp16-pyt-20190225'
        if use_fp16
        else 'https://developer.nvidia.com/joc-ssd-fp32-pyt-20190225'
    )
    ckpt_file = os.path.basename(checkpoint)
    already_downloaded = os.path.exists(ckpt_file)
    if reload_requested or not already_downloaded:
        sys.stderr.write(
            'Downloading checkpoint from {}\n'.format(checkpoint))
        urllib.request.urlretrieve(checkpoint, ckpt_file)

    weights = torch.load(ckpt_file)['model']
    # Unwrap the weights when the helper reports a distributed checkpoint.
    if checkpoint_from_distributed(weights):
        weights = unwrap_distributed(weights)
    model.load_state_dict(weights)
    return model
Beispiel #5
0
def load_tacotron2(fp16=fp16, device=device):
    """Build a Tacotron 2 model and load its weights from a local checkpoint.

    The checkpoint is read with ``torch.load`` from the module-level
    ``tacotron2_path`` and mapped onto ``device``. It must contain a
    ``'state_dict'`` entry (the weights) and a ``'config'`` entry; ``device``
    is stored in that config under the ``'device'`` key before the config is
    unpacked as keyword arguments to ``Tacotron2``.

    Args:
        fp16 (bool): when truthy, the model is cast to half precision and then
            passed through ``batchnorm_to_float`` and ``lstmcell_to_float``.
            The default is the value of the module-level ``fp16`` at the time
            this function is defined.
        device: target passed to ``torch.load(map_location=...)`` and to the
            model config. The default is the value of the module-level
            ``device`` at the time this function is defined.

    Returns:
        The Tacotron 2 model with the checkpoint weights loaded and
        ``text_to_sequence`` attached as an attribute.
    """
    from PyTorch.SpeechSynthesis.Tacotron2.tacotron2 import model as tacotron2
    from PyTorch.SpeechSynthesis.Tacotron2.models import batchnorm_to_float, lstmcell_to_float
    from PyTorch.SpeechSynthesis.Tacotron2.tacotron2.text import text_to_sequence

    checkpoint = torch.load(tacotron2_path, map_location=device)

    weights = checkpoint['state_dict']
    # Unwrap the weights when the helper reports a distributed checkpoint.
    if checkpoint_from_distributed(weights):
        weights = unwrap_distributed(weights)

    model_config = checkpoint['config']
    model_config['device'] = device
    model = tacotron2.Tacotron2(**model_config)

    if fp16:
        model = lstmcell_to_float(batchnorm_to_float(model.half()))

    model.load_state_dict(weights)
    # Expose the text front end on the model object for callers.
    model.text_to_sequence = text_to_sequence
    return model
def nvidia_tacotron2(pretrained=True, **kwargs):
    """Constructs a Tacotron 2 model (nn.module with additional infer(input) method).
    For detailed information on model input and output, training recipes, inference and performance
    visit: github.com/NVIDIA/DeepLearningExamples and/or ngc.nvidia.com

    When ``pretrained`` is true the checkpoint is downloaded into the current
    working directory under the last path component of its URL, and reused on
    later calls; the model configuration then comes from the checkpoint. When
    ``pretrained`` is false the model is built from a default configuration,
    and any keyword argument whose name matches a configuration key overrides
    that value.

    Args (type[, default value]):
        pretrained (bool, True): If True, returns a model pretrained on LJ Speech dataset.
        model_math (str, 'fp32'): returns a model in given precision ('fp32' or 'fp16')
        force_reload (bool, False): If True, download the checkpoint again even
            when a previously downloaded copy exists.
        n_symbols (int, 148): Number of symbols used in a sequence passed to the prenet, see
                              https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/tacotron2/text/symbols.py
        p_attention_dropout (float, 0.1): dropout probability on attention LSTM (1st LSTM layer in decoder)
        p_decoder_dropout (float, 0.1): dropout probability on decoder LSTM (2nd LSTM layer in decoder)
        max_decoder_steps (int, 1000): maximum number of generated mel spectrograms during inference

    Returns:
        The constructed Tacotron 2 model.
    """
    import os
    import sys

    from PyTorch.SpeechSynthesis.Tacotron2.tacotron2 import model as tacotron2
    from PyTorch.SpeechSynthesis.Tacotron2.models import lstmcell_to_float, batchnorm_to_float

    fp16 = kwargs.get("model_math") == "fp16"
    force_reload = bool(kwargs.get("force_reload", False))

    if pretrained:
        if fp16:
            checkpoint = 'https://developer.nvidia.com/joc-tacotron2-fp16-pyt-20190306'
        else:
            checkpoint = 'https://developer.nvidia.com/joc-tacotron2-fp32-pyt-20190306'
        # Name the local file after the URL so the fp16 and fp32 checkpoints
        # are cached separately and never mistaken for one another.
        ckpt_file = os.path.basename(checkpoint)
        if force_reload or not os.path.exists(ckpt_file):
            sys.stderr.write(
                'Downloading checkpoint from {}\n'.format(checkpoint))
            # Download under a temporary name and rename afterwards, so an
            # interrupted transfer is never treated as a cached checkpoint.
            partial_file = ckpt_file + '.part'
            urllib.request.urlretrieve(checkpoint, partial_file)
            os.replace(partial_file, ckpt_file)
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        ckpt = torch.load(ckpt_file)
        state_dict = ckpt['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)
        config = ckpt['config']
    else:
        config = {
            'mask_padding': False,
            'n_mel_channels': 80,
            'n_symbols': 148,
            'symbols_embedding_dim': 512,
            'encoder_kernel_size': 5,
            'encoder_n_convolutions': 3,
            'encoder_embedding_dim': 512,
            'attention_rnn_dim': 1024,
            'attention_dim': 128,
            'attention_location_n_filters': 32,
            'attention_location_kernel_size': 31,
            'n_frames_per_step': 1,
            'decoder_rnn_dim': 1024,
            'prenet_dim': 256,
            'max_decoder_steps': 1000,
            'gate_threshold': 0.5,
            'p_attention_dropout': 0.1,
            'p_decoder_dropout': 0.1,
            'postnet_embedding_dim': 512,
            'postnet_kernel_size': 5,
            'postnet_n_convolutions': 5,
            'decoder_no_early_stopping': False,
        }
        # Keyword arguments that match no configuration key are ignored.
        for k, v in kwargs.items():
            if k in config:
                config[k] = v

    m = tacotron2.Tacotron2(**config)

    if fp16:
        m = batchnorm_to_float(m.half())
        m = lstmcell_to_float(m)

    if pretrained:
        m.load_state_dict(state_dict)

    return m