Example #1
    def load_model(self):
        print("loading model...")
        args = self.args
        parser = self.parser

        tacotron2 = load_and_setup_model('Tacotron2',
                                         parser,
                                         args.tacotron2,
                                         args.amp_run,
                                         args.cpu_run,
                                         forward_is_infer=True)
        waveglow = load_and_setup_model('WaveGlow',
                                        parser,
                                        args.waveglow,
                                        args.amp_run,
                                        args.cpu_run,
                                        forward_is_infer=True)

        if args.cpu_run:
            denoiser = Denoiser(waveglow, args.cpu_run)
        else:
            denoiser = Denoiser(waveglow, args.cpu_run).cuda()

        jitted_tacotron2 = torch.jit.script(tacotron2)

        print("warming up...")
        if args.include_warmup:
            if args.cpu_run:
                sequence = torch.randint(low=0,
                                         high=148,
                                         size=(1, 50),
                                         dtype=torch.long)
                input_lengths = torch.IntTensor([sequence.size(1)]).long()
            else:
                sequence = torch.randint(low=0,
                                         high=148,
                                         size=(1, 50),
                                         dtype=torch.long).cuda()
                input_lengths = torch.IntTensor([sequence.size(1)
                                                 ]).cuda().long()

            for i in range(3):
                with torch.no_grad():
                    mel, mel_lengths, _ = jitted_tacotron2(
                        sequence, input_lengths)
                    _ = waveglow(mel)

        self.jitted_tacotron2 = jitted_tacotron2
        self.waveglow = waveglow
        self.denoiser = denoiser
        print("done...")
Example #2
def _denoiser(waveglow,
              filter_length=1024,
              n_overlap=4,
              win_length=1024,
              mode='zeros'):
    denoiser = Denoiser(waveglow, filter_length, n_overlap, win_length, mode)
    return denoiser
Example #3
def setup():
    global model, waveglow, denoiser, hparams

    hparams = create_hparams()
    hparams.sampling_rate = 22050

    checkpoint_path = downloads.download_from_gdrive(
        gdrive_fileid='1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA',
        output_path='tacotron2/tacotron2_statedict.pt')

    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval().half()

    waveglow_path = downloads.download_from_gdrive(
        gdrive_fileid='1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF',
        output_path='tacotron2/waveglow_256channels_universal_v5.pt')

    with submodules.localimport('submodules/tacotron2/waveglow') as _importer:
        waveglow_ = torch.load(waveglow_path)
        waveglow = waveglow_['model']
    waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)
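A follow-up sketch, not part of the original setup(), showing how the globals initialized above could drive end-to-end synthesis; it assumes text_to_sequence from the same tacotron2 repository is importable:

def synthesize(text, sigma=0.666, denoiser_strength=0.01):
    # encode the text and run the fp16 Tacotron 2 + WaveGlow pipeline set up above
    sequence = torch.LongTensor(
        text_to_sequence(text, ['english_cleaners']))[None, :].cuda()
    with torch.no_grad():
        _, mel_outputs_postnet, _, _ = model.inference(sequence)
        audio = waveglow.infer(mel_outputs_postnet, sigma=sigma)
        audio = denoiser(audio, strength=denoiser_strength)[:, 0]
    return audio[0].float().cpu().numpy(), hparams.sampling_rate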
Example #4
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for i, file_path in enumerate(mel_files):
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(output_dir,
                                  "{}_synthesis.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
Example #5
def init_vocoder():
    waveglow_path = '../waveglow_256channels_universal_v5.pt'
    waveglow = torch.load(waveglow_path)['model']
    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)
    return waveglow, denoiser
Example #6
def load_denoiser(waveglow_path):
    """
    Library:
        from waveglow.denoiser import Denoiser
    """
    waveglow_for_denoiser = torch.load(waveglow_path)['model']
    waveglow_for_denoiser.cuda()
    denoiser_mode = 'zeros'
    denoiser = Denoiser(waveglow_for_denoiser, mode=denoiser_mode)
    return denoiser
Example #7
def load_models(hparams, checkpoint_path, waveglow_path):
    print("load models...")
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    model.cuda().eval()

    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval()
    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)
    print("loaded!")

    return model, waveglow
Example #8
def init_model():
    hparams = create_hparams()

    checkpoint_path = "checkpoints/mellotron_libritts.pt"
    tacotron = load_model(hparams).cpu().eval()
    tacotron.load_state_dict(
        torch.load(checkpoint_path,
                   map_location=torch.device('cpu'))['state_dict'])

    waveglow_path = 'checkpoints/waveglow_256channels_v4.pt'
    waveglow = torch.load(
        waveglow_path, map_location=torch.device('cpu'))['model'].cpu().eval()
    denoiser = Denoiser(waveglow).cpu().eval()
    return (tacotron, waveglow, denoiser)
Example #9
def load_tts_vocoder_models(tacotron_checkpoint_path,
                            waveglow_checkpoint_path):
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    model = load_model(hparams)
    model.load_state_dict(torch.load(tacotron_checkpoint_path)['state_dict'])
    _ = model.cuda().eval()

    waveglow = torch.load(waveglow_checkpoint_path)['model']
    waveglow.cuda().eval()
    #for k in waveglow.convinv:
    #    k.float()
    denoiser = Denoiser(waveglow)
    return model, waveglow, denoiser, hparams
Example #10
def load_waveglow_model(model_path: str, device: torch.device):
    # this is required for pickle to see glow module
    sys.path.append("tts_dev/waveglow/")

    waveglow = torch.load(model_path, map_location=device)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)

    waveglow.eval()
    if device.type != 'cpu':
        waveglow.cuda().half()
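    # keep the invertible 1x1 convolutions in fp32; their inverses are
    # commonly kept in full precision for numerical stability, especially
    # when the rest of the model runs in half precision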
    for k in waveglow.convinv:
        k.float()

    denoiser = Denoiser(waveglow)

    return waveglow, denoiser
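A hypothetical call site for this loader; the checkpoint path, the mel tensor, and the sigma/strength values below are placeholders rather than values from the source:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
waveglow, denoiser = load_waveglow_model('checkpoints/waveglow.pt', device)

def vocode(mel, sigma=0.9, strength=0.01):
    # mel: a (1, n_mel_channels, frames) tensor already on `device`
    if device.type != 'cpu':
        mel = mel.half()
    with torch.no_grad():
        audio = waveglow.infer(mel, sigma=sigma)
        audio = denoiser(audio.float(), strength=strength)[:, 0]
    return audio.cpu().numpy()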
Example #11
    def __init__(self, ckpt, wglw, n_speakers=123):
        print("[Loading Model]")
        self.ckpt = ckpt
        self.hparams = create_hparams()
        self.hparams.n_speakers = n_speakers
        self.stft = TacotronSTFT(self.hparams.filter_length,
                                 self.hparams.hop_length,
                                 self.hparams.win_length,
                                 self.hparams.n_mel_channels,
                                 self.hparams.sampling_rate,
                                 self.hparams.mel_fmin, self.hparams.mel_fmax)
        self.mellotron = load_model(self.hparams).cuda().eval()
        self.waveglow = torch.load(wglw)['model'].cuda().eval()
        self.denoiser = Denoiser(self.waveglow).cuda().eval()
        self.arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
        self.mellotron.load_state_dict(torch.load(ckpt)['state_dict'])
        print('[Loaded Model]')
Example #12
def load_tts_model(checkpoint_path=None, waveglow_path=None):

    # set-up params
    hparams = create_hparams()

    # load model from checkpoint
    model = load_model(hparams)
    model.load_state_dict(
        torch.load(checkpoint_path, map_location='cpu')['state_dict'])
    _ = model.eval()

    # Load WaveGlow for mel2audio synthesis and denoiser
    waveglow = torch.load(waveglow_path, map_location='cpu')['model']
    waveglow.eval()

    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)

    return model, denoiser, waveglow, hparams
Example #13
        os.mkdir(output_dir)
    logging.basicConfig(filename=os.path.join(output_dir, 'debug.log'),
                        level=logging.DEBUG)
    logging.info('Output dir: %s', output_dir)

    # Parameters
    teacher_utt_path = args.teacher_utterance_path
    checkpoint_path = args.ppg2mel_model
    waveglow_path = args.waveglow_model
    is_clip = False  # Set to True to control the output length of AC.
    fs = 16000
    waveglow_sigma = 0.6
    waveglow_for_denoiser = torch.load(waveglow_path)['model']
    waveglow_for_denoiser.cuda()
    denoiser_mode = 'zeros'
    denoiser = Denoiser(waveglow_for_denoiser, mode=denoiser_mode)
    denoiser_strength = 0.005
    # End of parameters

    logging.debug('Tacotron: %s', checkpoint_path)
    logging.debug('Waveglow: %s', waveglow_path)
    logging.debug('AM: SI model')
    logging.debug('is_clip: %d', is_clip)
    logging.debug('Fs: %d', fs)
    logging.debug('Sigma: %f', waveglow_sigma)
    logging.debug('Denoiser strength: %f', denoiser_strength)
    logging.debug('Denoiser mode: %s', denoiser_mode)

    hparams = create_hparams_stage()
    taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                             hparams.win_length, hparams.n_acoustic_feat_dims,
Example #14
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.output + '/' +
                          args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    tacotron2 = load_and_setup_model('Tacotron2',
                                     parser,
                                     args.tacotron2,
                                     args.amp_run,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow',
                                    parser,
                                    args.waveglow,
                                    args.amp_run,
                                    forward_is_infer=True)
    denoiser = Denoiser(waveglow).cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    texts = []
    try:
        f = open(args.input, 'r')
        texts = f.readlines()
    except:
        print("Could not read file")
        sys.exit(1)

    if args.include_warmup:
        sequence = torch.randint(low=0,
                                 high=148,
                                 size=(1, 50),
                                 dtype=torch.long).cuda()
        input_lengths = torch.IntTensor([sequence.size(1)]).cuda().long()
        for i in range(3):
            with torch.no_grad():
                mel, mel_lengths = jitted_tacotron2(sequence, input_lengths)
                _ = waveglow(mel)

    measurements = {}

    sequences_padded, input_lengths = prepare_input_sequence(texts)

    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        mel, mel_lengths = jitted_tacotron2(sequences_padded, input_lengths)

    with torch.no_grad(), MeasureTime(measurements, "waveglow_time"):
        audios = waveglow(mel, sigma=args.sigma_infer)
        audios = audios.float()
        audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    print("Stopping after", mel.size(2), "decoder steps")
    tacotron2_infer_perf = mel.size(0) * mel.size(
        2) / measurements['tacotron2_time']
    waveglow_infer_perf = audios.size(0) * audios.size(
        1) / measurements['waveglow_time']

    DLLogger.log(step=0,
                 data={"tacotron2_items_per_sec": tacotron2_infer_perf})
    DLLogger.log(step=0,
                 data={"tacotron2_latency": measurements['tacotron2_time']})
    DLLogger.log(step=0, data={"waveglow_items_per_sec": waveglow_infer_perf})
    DLLogger.log(step=0,
                 data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0,
                 data={
                     "latency": (measurements['tacotron2_time'] +
                                 measurements['waveglow_time'])
                 })

    for i, audio in enumerate(audios):
        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        audio_path = args.output + "audio_" + str(i) + ".wav"
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.flush()
Example #15
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(description='PyTorch FastPitch Inference',
                                     allow_abbrev=False)
    parser = parse_args(parser)
    args, unk_args = parser.parse_known_args()

    if args.p_arpabet > 0.0:
        cmudict.initialize(args.cmudict_path, keep_ambiguous=True)

    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if args.output is not None:
        Path(args.output).mkdir(parents=False, exist_ok=True)

    log_fpath = args.log_file or str(Path(args.output, 'nvlog_infer.json'))
    log_fpath = unique_log_fpath(log_fpath)
    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
                            StdOutBackend(Verbosity.VERBOSE,
                                          metric_format=stdout_metric_format)])
    init_inference_metadata()
    [DLLogger.log("PARAMETER", {k: v}) for k, v in vars(args).items()]

    device = torch.device('cuda' if args.cuda else 'cpu')

    if args.fastpitch != 'SKIP':
        generator = load_and_setup_model(
            'FastPitch', parser, args.fastpitch, args.amp, device,
            unk_args=unk_args, forward_is_infer=True, ema=args.ema,
            jitable=args.torchscript)

        if args.torchscript:
            generator = torch.jit.script(generator)
    else:
        generator = None

    if args.waveglow != 'SKIP':
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            waveglow = load_and_setup_model(
                'WaveGlow', parser, args.waveglow, args.amp, device,
                unk_args=unk_args, forward_is_infer=True, ema=args.ema)
        denoiser = Denoiser(waveglow).to(device)
        waveglow = getattr(waveglow, 'infer', waveglow)
    else:
        waveglow = None

    if len(unk_args) > 0:
        raise ValueError(f'Invalid options {unk_args}')

    fields = load_fields(args.input)
    batches = prepare_input_sequence(
        fields, device, args.symbol_set, args.text_cleaners, args.batch_size,
        args.dataset_path, load_mels=(generator is None), p_arpabet=args.p_arpabet)

    # Use real data rather than synthetic - FastPitch predicts len
    for _ in tqdm(range(args.warmup_steps), 'Warmup'):
        with torch.no_grad():
            if generator is not None:
                b = batches[0]
                mel, *_ = generator(b['text'])
            if waveglow is not None:
                audios = waveglow(mel, sigma=args.sigma_infer).float()
                _ = denoiser(audios, strength=args.denoising_strength)

    gen_measures = MeasureTime(cuda=args.cuda)
    waveglow_measures = MeasureTime(cuda=args.cuda)

    gen_kw = {'pace': args.pace,
              'speaker': args.speaker,
              'pitch_tgt': None,
              'pitch_transform': build_pitch_transformation(args)}

    if args.torchscript:
        gen_kw.pop('pitch_transform')
        print('NOTE: Pitch transforms are disabled with TorchScript')

    all_utterances = 0
    all_samples = 0
    all_letters = 0
    all_frames = 0

    reps = args.repeats
    log_enabled = reps == 1
    log = lambda s, d: DLLogger.log(step=s, data=d) if log_enabled else None

    for rep in (tqdm(range(reps), 'Inference') if reps > 1 else range(reps)):
        for b in batches:
            if generator is None:
                log(rep, {'Synthesizing from ground truth mels'})
                mel, mel_lens = b['mel'], b['mel_lens']
            else:
                with torch.no_grad(), gen_measures:
                    mel, mel_lens, *_ = generator(b['text'], **gen_kw)

                gen_infer_perf = mel.size(0) * mel.size(2) / gen_measures[-1]
                all_letters += b['text_lens'].sum().item()
                all_frames += mel.size(0) * mel.size(2)
                log(rep, {"fastpitch_frames/s": gen_infer_perf})
                log(rep, {"fastpitch_latency": gen_measures[-1]})

                if args.save_mels:
                    for i, mel_ in enumerate(mel):
                        m = mel_[:, :mel_lens[i].item()].permute(1, 0)
                        fname = b['output'][i] if 'output' in b else f'mel_{i}.npy'
                        mel_path = Path(args.output, Path(fname).stem + '.npy')
                        np.save(mel_path, m.cpu().numpy())

            if waveglow is not None:
                with torch.no_grad(), waveglow_measures:
                    audios = waveglow(mel, sigma=args.sigma_infer)
                    audios = denoiser(audios.float(),
                                      strength=args.denoising_strength
                                      ).squeeze(1)

                all_utterances += len(audios)
                all_samples += sum(audio.size(0) for audio in audios)
                waveglow_infer_perf = (
                    audios.size(0) * audios.size(1) / waveglow_measures[-1])

                log(rep, {"waveglow_samples/s": waveglow_infer_perf})
                log(rep, {"waveglow_latency": waveglow_measures[-1]})

                if args.output is not None and reps == 1:
                    for i, audio in enumerate(audios):
                        audio = audio[:mel_lens[i].item() * args.stft_hop_length]

                        if args.fade_out:
                            fade_len = args.fade_out * args.stft_hop_length
                            fade_w = torch.linspace(1.0, 0.0, fade_len)
                            audio[-fade_len:] *= fade_w.to(audio.device)

                        audio = audio / torch.max(torch.abs(audio))
                        fname = b['output'][i] if 'output' in b else f'audio_{i}.wav'
                        audio_path = Path(args.output, fname)
                        write(audio_path, args.sampling_rate, audio.cpu().numpy())

            if generator is not None and waveglow is not None:
                log(rep, {"latency": (gen_measures[-1] + waveglow_measures[-1])})

    log_enabled = True
    if generator is not None:
        gm = np.sort(np.asarray(gen_measures))
        rtf = all_samples / (all_utterances * gm.mean() * args.sampling_rate)
        log((), {"avg_fastpitch_letters/s": all_letters / gm.sum()})
        log((), {"avg_fastpitch_frames/s": all_frames / gm.sum()})
        log((), {"avg_fastpitch_latency": gm.mean()})
        log((), {"avg_fastpitch_RTF": rtf})
        log((), {"90%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.90) / 2) * gm.std()})
        log((), {"95%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.95) / 2) * gm.std()})
        log((), {"99%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.99) / 2) * gm.std()})
    if waveglow is not None:
        wm = np.sort(np.asarray(waveglow_measures))
        rtf = all_samples / (all_utterances * wm.mean() * args.sampling_rate)
        log((), {"avg_waveglow_samples/s": all_samples / wm.sum()})
        log((), {"avg_waveglow_latency": wm.mean()})
        log((), {"avg_waveglow_RTF": rtf})
        log((), {"90%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.90) / 2) * wm.std()})
        log((), {"95%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.95) / 2) * wm.std()})
        log((), {"99%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.99) / 2) * wm.std()})
    if generator is not None and waveglow is not None:
        m = gm + wm
        rtf = all_samples / (all_utterances * m.mean() * args.sampling_rate)
        log((), {"avg_samples/s": all_samples / m.sum()})
        log((), {"avg_letters/s": all_letters / m.sum()})
        log((), {"avg_latency": m.mean()})
        log((), {"avg_RTF": rtf})
        log((), {"90%_latency": m.mean() + norm.ppf((1.0 + 0.90) / 2) * m.std()})
        log((), {"95%_latency": m.mean() + norm.ppf((1.0 + 0.95) / 2) * m.std()})
        log((), {"99%_latency": m.mean() + norm.ppf((1.0 + 0.99) / 2) * m.std()})
    DLLogger.flush()
Example #16
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU or CPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()
    use_custom_naming = args.custom_name
    input_path = args.input
    text_cleaners = args.text_cleaners

    check_directory_and_create(args.output, exists_warning=True)

    # import pdb; pdb.set_trace()
    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.output + '/' +
                          args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    if args.use_extracted_mels:
        print(f"mel found in {args.mel_path}")
        mel = torch.load(args.mel_path)
        mel = mel.unsqueeze(0)
        print(f"The size of the mel we just loaded is {mel.shape}")
        audios = apply_griffin_lim(args, mel)
    else:
        tacotron2 = load_and_setup_model('Tacotron2',
                                         parser,
                                         args.tacotron2,
                                         args.fp16,
                                         args.cpu,
                                         forward_is_infer=True)

        if not args.use_griffin_lim:
            waveglow = \
                load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.fp16, args.cpu, forward_is_infer=True)
            denoiser = Denoiser(waveglow)
            if not args.cpu:
                denoiser.cuda()

        jitted_tacotron2 = torch.jit.script(tacotron2)

        texts = []
        try:
            f = open(args.input, 'r')
            texts = f.readlines()
        except:
            print("Could not read file")
            sys.exit(1)

        if args.include_warmup and (not args.use_griffin_lim):
            sequence = torch.randint(low=0, high=148, size=(1, 50)).long()
            input_lengths = torch.IntTensor([sequence.size(1)]).long()
            if not args.cpu:
                sequence = sequence.cuda()
                input_lengths = input_lengths.cuda()
            for i in range(3):
                with torch.no_grad():
                    mel, mel_lengths, _ = jitted_tacotron2(
                        sequence, input_lengths)
                    _ = waveglow(mel)

        measurements = {}

        sequences_padded, input_lengths = \
            prepare_input_sequence(texts, args.cpu, text_cleaners)

        with torch.no_grad(), MeasureTime(measurements, "tacotron2_time",
                                          args.cpu):
            mel, mel_lengths, alignments = jitted_tacotron2(
                sequences_padded, input_lengths)

        if args.use_griffin_lim:
            print(f"The size of the generated mel spec is {mel.shape}")
            audios = apply_griffin_lim(args, mel)
            # import pdb; pdb.set_trace()
            # audios = audios.cpu().numpy()
            #audio = audio.astype('int16')
            # audio_path = os.path.join('samples', "{}_synthesis.wav".format(out_filename))
            # write(audio_path, hparams.sampling_rate, audio)
            # print(audio_path)
        else:
            with torch.no_grad(), MeasureTime(measurements, "waveglow_time",
                                              args.cpu):
                audios = waveglow(mel, sigma=args.sigma_infer)
                audios = audios.float()
            with torch.no_grad(), MeasureTime(measurements, "denoiser_time",
                                              args.cpu):
                audios = denoiser(audios,
                                  strength=args.denoising_strength).squeeze(1)

            print("Stopping after", mel.size(2), "decoder steps")

            tacotron2_infer_perf = mel.size(0) * mel.size(
                2) / measurements['tacotron2_time']
            waveglow_infer_perf = audios.size(0) * audios.size(
                1) / measurements['waveglow_time']

            DLLogger.log(
                step=0, data={"tacotron2_items_per_sec": tacotron2_infer_perf})
            DLLogger.log(
                step=0,
                data={"tacotron2_latency": measurements['tacotron2_time']})
            DLLogger.log(step=0,
                         data={"waveglow_items_per_sec": waveglow_infer_perf})
            DLLogger.log(
                step=0,
                data={"waveglow_latency": measurements['waveglow_time']})
            DLLogger.log(
                step=0,
                data={"denoiser_latency": measurements['denoiser_time']})
            DLLogger.log(step=0,
                         data={
                             "latency": (measurements['tacotron2_time'] +
                                         measurements['waveglow_time'] +
                                         measurements['denoiser_time'])
                         })

    for i, audio in enumerate(audios):
        if use_custom_naming:
            if args.use_extracted_mels:
                custom_name = (args.mel_path.split("/")[-1]).split(".")[0]
            else:
                custom_name = (input_path.split("/")[-1]).split(".")[0]
            custom_path = os.path.join(args.output, custom_name)
            if not args.use_extracted_mels:
                # save alignment
                # import pdb; pdb.set_trace()
                plt.imshow(alignments[i].float().data.cpu().numpy().T,
                           aspect="auto",
                           origin="lower")
                figure_path = custom_path + "_alignment.png"
                plt.savefig(figure_path)
                meltitle = "_predicted"
            else:
                meltitle = "_extracetd"
                # save predicted mel
            # import pdb; pdb.set_trace()
            plot_mel_spectrogram(
                mel,
                title=meltitle,
                dirname=custom_path,
                append_name=True,
                load_mel_path=False,
                # load_mel_path=True
            )
            # save generated audio
            # if not args.use_griffin_lim:
            if not args.use_extracted_mels:
                audio = audio[:mel_lengths[i] * args.stft_hop_length]
            audio = audio / torch.max(torch.abs(audio))
            # custom_name = (input_path.split("/")[-1]).split(".")[0]
            audio_path = custom_path + ".wav"
            write(audio_path, args.sampling_rate, audio.cpu().numpy())
        else:
            plt.imshow(alignments[i].float().data.cpu().numpy().T,
                       aspect="auto",
                       origin="lower")
            # figure_path = args.output+"alignment_"+str(i)+"_"+args.suffix+".png"
            figure_path = "alignment_" + str(i) + "_" + args.suffix + ".png"
            # import pdb; pdb.set_trace()
            figure_path = os.path.join(args.output, figure_path)
            plt.savefig(figure_path)
            audio = audio[:mel_lengths[i] * args.stft_hop_length]
            audio = audio / torch.max(torch.abs(audio))
            audio_path = \
                os.path.join(args.output, "audio_"+str(i)+"_"+args.suffix+".wav")
            write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.flush()
Example #17
def get_evaluator(evaluator_classname: str,
                  encoder_hparams: HParams,
                  encoder_checkpoint_path: str,
                  vocoder_hparams: HParams,
                  vocoder_checkpoint_path: str,
                  use_denoiser: bool = True,
                  device: str = 'cpu') -> BaseEvaluator:
    """
    Creates an instance of an Evaluator for synthesis.
    Args:
        evaluator_classname: `str` class of evaluator
        encoder_hparams: `HParams` with tacotron2 meta
        encoder_checkpoint_path: `str` path to tacotron2 checkpoint
        vocoder_hparams: `HParams` with waveglow meta
        vocoder_checkpoint_path: `str` path to waveglow checkpoint
        use_denoiser: `bool` whether to apply the denoiser as a post-processing step
        device: `str` identifier for device to use

    Returns:
        `BaseEvaluator` instance
    """
    encoder = Factory.get_object(
        f"tacotron2.models.{encoder_hparams['model_class_name']}",
        encoder_hparams)
    encoder.load_state_dict(
        torch.load(encoder_checkpoint_path,
                   map_location=device)['model_state_dict'])
    encoder.to(device)

    vocoder = Factory.get_object(
        f"waveglow.models.{vocoder_hparams['model_class_name']}",
        vocoder_hparams)
    vocoder_loaded_weights = torch.load(vocoder_checkpoint_path,
                                        map_location=device)

    if 'model_state_dict' in vocoder_loaded_weights:
        vocoder.load_state_dict(
            torch.load(vocoder_checkpoint_path,
                       map_location=device)['model_state_dict'])
    else:
        vocoder.load_state_dict(
            torch.load(vocoder_checkpoint_path, map_location=device))
    vocoder.to(device)

    if use_denoiser:
        denoiser = Denoiser(vocoder, device=device)
    else:
        denoiser = None

    tokenizer = Factory.get_object(
        f"tacotron2.tokenizers.{encoder_hparams['tokenizer_class_name']}")

    evaluator = Factory.get_object(
        f"tacotron2.evaluators.{evaluator_classname}",
        encoder=encoder,
        vocoder=vocoder,
        tokenizer=tokenizer,
        denoiser=denoiser,
        device=device)

    return evaluator
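A hypothetical call, assuming the HParams objects and checkpoint paths are supplied by the surrounding project (none of these names come from the source):

evaluator = get_evaluator(
    evaluator_classname='Evaluator',                     # hypothetical class name
    encoder_hparams=encoder_hparams,                     # HParams loaded elsewhere
    encoder_checkpoint_path='checkpoints/tacotron2.pt',  # placeholder path
    vocoder_hparams=vocoder_hparams,                     # HParams loaded elsewhere
    vocoder_checkpoint_path='checkpoints/waveglow.pt',   # placeholder path
    use_denoiser=True,
    device='cpu')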
Example #18
    hparams = create_hparams()

    # Load model from checkpoint
    checkpoint_path = "./outdir/4/checkpoint_57500"
    model, _ = load_Tacotron2(hparams, device)
    model.load_state_dict(
        torch.load(checkpoint_path, map_location=device)['state_dict'])
    _ = model.eval()

    # Load WaveGlow for mel2audio synthesis
    if device == torch.device('cuda'):
        sys.path.insert(0, "waveglow/")  # To look glow(original version) first
        from waveglow.denoiser import Denoiser
    else:
        sys.path.insert(
            0, "waveglow_cpu_components/")  # To look glow(cpu version) first
        from waveglow_cpu_components.denoiser import Denoiser
    waveglow_path = './waveglow/waveglow_170000_22k'
    waveglow = torch.load(waveglow_path, map_location=device)['model']
    waveglow.eval()
    denoiser = Denoiser(waveglow).to(device)

    # Start Server
    tornado_logger = TornadoLogger()
    logger = tornado_logger.logger
    logger.info("Server Start")

    app = make_app(model, waveglow, hparams, denoiser, device, logger)
    app.listen(8888)
    tornado.ioloop.IOLoop.current().start()
Example #19
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval().half()

    # Gyeongsang: "/home/ubuntu/Workspaces/thien/nvidia-tacotron-je/outdir/male/gyeongsang/waveglow_gyeongsang_266000"
    # Jeju: "/home/ubuntu/Workspaces/thien/nvidia-tacotron-je/outdir/waveglow_jeju_146000"
    # Jeolla: "/home/ubuntu/Workspaces/thien/nvidia-tacotron-jeonla/outdir/waveglow_240000"
    waveglow_path = "/home/ubuntu/Workspaces/thien/nvidia-tacotron-je/outdir/male/gyeongsang/waveglow_gyeongsang_266000"
    taco = checkpoint_path.split('_')[-1]
    wave = waveglow_path.split('_')[-1]

    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)
    # Input texts
    txt_list = [
        '여러분이 있었기 땜시 즈희가 잘할 수 있었십니다', '니는 어떤 제목 드라마를 좋아하노?',
        '내 친구들은 다 휴가 갔습니더', '어. 그라모 다덜 이메일 주소들 좀 도.', '내는 이 책으로 열심히 공부하고 싶어예',
        '이 둘은 같은 디자인인데 사이즈가 다릅니더', '온라인상에서도 마찬가지입니더',
        '고객분들에 한해 무료로 배포하는거 아닙니꺼?', '애들이 묵기에는 쪼매 그렇네예.',
        '당신은 기차역에서 열차를 잘못 탔습니더', '그녀는 매사에 정확한 사람입니더',
        '갈비탕을 맛있게 하는 곳이 있으믄 거 가고 싶데이.', '건물 중에 어데 갈라꼬 하시는건가예?',
        '훨씬 나아지긴 했는데 지금은 너무 밝아서 파이다.', '예, 문제가 있으신가예?',
        '당신은 내랑 꼭 같이 가지 않아도 됩니더', '당신 마이 아파 보이는데 병원에 가보는 게 어떻습니꺼?',
        '저는 제가 결혼하게 되어가 기쁩니더', '오늘이 물리치료 몇 번째 받으시는 긴가예?',
        '영어보다 중국어로 말씀을 더 잘하시네예'
    ]
    for i, text in enumerate(txt_list):
        # text = "야. 도로모깡도 왜정시대나 낫주. 도로모깡도 엇일 땐양 허벅에."
Example #20
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(description='PyTorch FastPitch Inference',
                                     allow_abbrev=False)
    parser = parse_args(parser)
    args, unk_args = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'FastPitch_PyT'})

    if args.output is not None:
        Path(args.output).mkdir(parents=False, exist_ok=True)

    device = torch.device('cuda' if args.cuda else 'cpu')

    if args.fastpitch is not None:
        generator = load_and_setup_model('FastPitch',
                                         parser,
                                         args.fastpitch,
                                         args.amp_run,
                                         device,
                                         unk_args=unk_args,
                                         forward_is_infer=True,
                                         ema=args.ema,
                                         jitable=args.torchscript)

        if args.torchscript:
            generator = torch.jit.script(generator)
    else:
        generator = None

    if args.waveglow is not None:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            waveglow = load_and_setup_model('WaveGlow',
                                            parser,
                                            args.waveglow,
                                            args.amp_run,
                                            device,
                                            unk_args=unk_args,
                                            forward_is_infer=True,
                                            ema=args.ema)
        denoiser = Denoiser(waveglow).to(device)
        waveglow = getattr(waveglow, 'infer', waveglow)
    else:
        waveglow = None

    if len(unk_args) > 0:
        raise ValueError(f'Invalid options {unk_args}')

    fields = load_fields(args.input)
    batches = prepare_input_sequence(fields,
                                     device,
                                     args.batch_size,
                                     args.dataset_path,
                                     load_mels=(generator is None))

    if args.include_warmup:
        # Use real data rather than synthetic - FastPitch predicts len
        for i in range(3):
            with torch.no_grad():
                if generator is not None:
                    b = batches[0]
                    mel, *_ = generator(b['text'], b['text_lens'])
                if waveglow is not None:
                    audios = waveglow(mel, sigma=args.sigma_infer).float()
                    _ = denoiser(audios, strength=args.denoising_strength)

    gen_measures = MeasureTime()
    waveglow_measures = MeasureTime()

    gen_kw = {
        'pace': args.pace,
        'pitch_tgt': None,
        'pitch_transform': build_pitch_transformation(args)
    }

    if args.torchscript:
        gen_kw.pop('pitch_transform')

    all_utterances = 0
    all_samples = 0
    all_letters = 0
    all_frames = 0

    reps = args.repeats
    log_enabled = reps == 1
    log = lambda s, d: DLLogger.log(step=s, data=d) if log_enabled else None

    for repeat in (tqdm.tqdm(range(reps)) if reps > 1 else range(reps)):
        for b in batches:
            if generator is None:
                log(0, {'Synthesizing from ground truth mels'})
                mel, mel_lens = b['mel'], b['mel_lens']
            else:
                with torch.no_grad(), gen_measures:
                    mel, mel_lens, *_ = generator(b['text'], b['text_lens'],
                                                  **gen_kw)

                gen_infer_perf = mel.size(0) * mel.size(2) / gen_measures[-1]
                all_letters += b['text_lens'].sum().item()
                all_frames += mel.size(0) * mel.size(2)
                log(0, {"generator_frames_per_sec": gen_infer_perf})
                log(0, {"generator_latency": gen_measures[-1]})

            if waveglow is not None:
                with torch.no_grad(), waveglow_measures:
                    audios = waveglow(mel, sigma=args.sigma_infer)
                    audios = denoiser(
                        audios.float(),
                        strength=args.denoising_strength).squeeze(1)

                all_utterances += len(audios)
                all_samples += sum(audio.size(0) for audio in audios)
                waveglow_infer_perf = (audios.size(0) * audios.size(1) /
                                       waveglow_measures[-1])

                log(0, {"waveglow_samples_per_sec": waveglow_infer_perf})
                log(0, {"waveglow_latency": waveglow_measures[-1]})

                if args.output is not None and reps == 1:
                    for i, audio in enumerate(audios):
                        audio = audio[:mel_lens[i].item() *
                                      args.stft_hop_length]

                        if args.fade_out:
                            fade_len = args.fade_out * args.stft_hop_length
                            fade_w = torch.linspace(1.0, 0.0, fade_len)
                            audio[-fade_len:] *= fade_w.to(audio.device)

                        audio = audio / torch.max(torch.abs(audio))
                        fname = b['output'][
                            i] if 'output' in b else f'audio_{i}.wav'
                        audio_path = Path(args.output, fname)
                        write(audio_path, args.sampling_rate,
                              audio.cpu().numpy())

            if generator is not None and waveglow is not None:
                log(0, {"latency": (gen_measures[-1] + waveglow_measures[-1])})

    log_enabled = True
    if generator is not None:
        gm = np.sort(np.asarray(gen_measures))
        log('avg', {"generator letters/s": all_letters / gm.sum()})
        log('avg', {"generator_frames/s": all_frames / gm.sum()})
        log('avg', {"generator_latency": gm.mean()})
        log('90%', {
            "generator_latency":
            gm.mean() + norm.ppf((1.0 + 0.90) / 2) * gm.std()
        })
        log('95%', {
            "generator_latency":
            gm.mean() + norm.ppf((1.0 + 0.95) / 2) * gm.std()
        })
        log('99%', {
            "generator_latency":
            gm.mean() + norm.ppf((1.0 + 0.99) / 2) * gm.std()
        })
    if waveglow is not None:
        wm = np.sort(np.asarray(waveglow_measures))
        log('avg', {"waveglow_samples/s": all_samples / wm.sum()})
        log('avg', {"waveglow_latency": wm.mean()})
        log('90%', {
            "waveglow_latency":
            wm.mean() + norm.ppf((1.0 + 0.90) / 2) * wm.std()
        })
        log('95%', {
            "waveglow_latency":
            wm.mean() + norm.ppf((1.0 + 0.95) / 2) * wm.std()
        })
        log('99%', {
            "waveglow_latency":
            wm.mean() + norm.ppf((1.0 + 0.99) / 2) * wm.std()
        })
    if generator is not None and waveglow is not None:
        m = gm + wm
        rtf = all_samples / (len(batches) * all_utterances * m.mean() *
                             args.sampling_rate)
        log('avg', {"samples/s": all_samples / m.sum()})
        log('avg', {"letters/s": all_letters / m.sum()})
        log('avg', {"latency": m.mean()})
        log('avg', {"RTF": rtf})
        log('90%',
            {"latency": m.mean() + norm.ppf((1.0 + 0.90) / 2) * m.std()})
        log('95%',
            {"latency": m.mean() + norm.ppf((1.0 + 0.95) / 2) * m.std()})
        log('99%',
            {"latency": m.mean() + norm.ppf((1.0 + 0.99) / 2) * m.std()})
    DLLogger.flush()
Example #21
sys.path.append(sys.argv[1])

# must be imported after path is modified
from import_utils import load_waveglow
from waveglow.denoiser import Denoiser

strength = 0.1
if len(sys.argv) == 5:
    strength = float(sys.argv[4])

print("Building denoiser")

waveglow = load_waveglow(sys.argv[2], WAVEGLOW_CONFIG)

denoiser = Denoiser(waveglow).cuda()

statedict = {}

statedict["denoiser.stft.forward_basis"] = denoiser.stft.forward_basis.cpu(
).numpy().tolist()
statedict["denoiser.stft.inverse_basis"] = denoiser.stft.inverse_basis.cpu(
).numpy().tolist()
statedict["denoiser.stft.win_sq"] = gen_win_sq(denoiser).tolist()
statedict["denoiser.bias_spec"] = (denoiser.bias_spec *
                                   strength).cpu().numpy().tolist()

with open(json_path, "w") as fout:
    json.dump(statedict, fout, indent=2)

print("Wrote to '%s'" % json_path)
Example #22
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU or CPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, unknown_args = parser.parse_known_args()

    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
                            StdOutBackend(Verbosity.VERBOSE)])
    for k,v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k:v})
    DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})

    measurements_all = {"pre_processing": [],
                        "tacotron2_latency": [],
                        "waveglow_latency": [],
                        "latency": [],
                        "type_conversion": [],
                        "data_transfer": [],
                        "storage": [],
                        "tacotron2_items_per_sec": [],
                        "waveglow_items_per_sec": [],
                        "num_mels_per_audio": [],
                        "throughput": []}

    print("args:", args, unknown_args)

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, args.amp_run, args.cpu_run, forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow, args.amp_run, args.cpu_run)

    if args.cpu_run:
        denoiser = Denoiser(waveglow, args.cpu_run)
    else:
        denoiser = Denoiser(waveglow, args.cpu_run).cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."]
    texts = [texts[0][:args.input_length]]
    texts = texts*args.batch_size

    warmup_iters = 3

    for iter in range(args.num_iters):

        measurements = {}

        with MeasureTime(measurements, "pre_processing", args.cpu_run):
            sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu_run)

        with torch.no_grad():
            with MeasureTime(measurements, "latency", args.cpu_run):
                with MeasureTime(measurements, "tacotron2_latency", args.cpu_run):
                    mel, mel_lengths, _ = jitted_tacotron2(sequences_padded, input_lengths)

                with MeasureTime(measurements, "waveglow_latency", args.cpu_run):
                    audios = waveglow.infer(mel, sigma=args.sigma_infer)
                    audios = audios.float()
                    audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

        num_mels = mel.size(0)*mel.size(2)
        num_samples = audios.size(0)*audios.size(1)

        with MeasureTime(measurements, "type_conversion", args.cpu_run):
            audios = audios.float()

        with MeasureTime(measurements, "data_transfer", args.cpu_run):
            audios = audios.cpu()

        with MeasureTime(measurements, "storage", args.cpu_run):
            audios = audios.numpy()
            for i, audio in enumerate(audios):
                audio_path = "audio_"+str(i)+".wav"
                write(audio_path, args.sampling_rate,
                      audio[:mel_lengths[i]*args.stft_hop_length])

        measurements['tacotron2_items_per_sec'] = num_mels/measurements['tacotron2_latency']
        measurements['waveglow_items_per_sec'] = num_samples/measurements['waveglow_latency']
        measurements['num_mels_per_audio'] = mel.size(2)
        measurements['throughput'] = num_samples/measurements['latency']

        if iter >= warmup_iters:
            for k,v in measurements.items():
                measurements_all[k].append(v)
                DLLogger.log(step=(iter-warmup_iters), data={k: v})

    DLLogger.flush()

    print_stats(measurements_all)
Example #23
def agumentation(arpabet_dict,
                 audio_paths,
                 target_spk_id_list,
                 output_path,
                 ljs=False):

    if not os.path.exists(output_path):
        os.makedirs(output_path)
    # Step1: Basic Setups

    if not ljs:
        # Whether to use lj speech
        checkpoint_path = "mellotron_libritts.pt"
    else:
        checkpoint_path = "mellotron_ljs.pt"
    if torch.cuda.is_available():
        tacotron = load_model(hparams).cuda().eval()
    else:
        tacotron = load_model(hparams).eval()
    tacotron.load_state_dict(
        torch.load(checkpoint_path, map_location="cpu")['state_dict'])

    waveglow_path = 'waveglow_256channels_v4.pt'
    if torch.cuda.is_available():
        waveglow = torch.load(waveglow_path)['model'].cuda().eval()
        denoiser = Denoiser(waveglow).cuda().eval()
    else:
        waveglow = torch.load(waveglow_path,
                              map_location="cpu")['model'].eval().cpu()
        denoiser = Denoiser(waveglow).eval()

    arpabet_dict = cmudict.CMUDict(arpabet_dict)
    dataloader = TextMelLoader(audio_paths, hparams)
    datacollate = TextMelCollate(1)

    # Step2: Load
    for file_idx in range(len(dataloader)):
        source_scp = open(os.path.join(output_path, "source.scp"),
                          "w",
                          encoding="utf-8")

        audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]
        source_scp.write("{} {}\n".format(file_idx, audio_path))

        # get audio path, encoded text, pitch contour and mel for gst
        text_encoded = torch.LongTensor(
            text_to_sequence(text, hparams.text_cleaners,
                             arpabet_dict))[None, :]
        pitch_contour = dataloader[file_idx][3][None]
        if torch.cuda.is_available():
            text_encoded = text_encoded.cuda()
            pitch_contour = pitch_contour.cuda()
        mel = load_mel(audio_path)
        # load source data to obtain rhythm using tacotron 2 as a forced aligner
        x, y = tacotron.parse_batch(datacollate([dataloader[file_idx]]))

        # Step3: Perform speaker transfer
        with torch.no_grad():
            # get rhythm (alignment map) using tacotron 2
            mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = tacotron.forward(
                x)
            rhythm = rhythm.permute(1, 0, 2)

        for spk_id in target_spk_id_list:
            speaker_id = torch.LongTensor([spk_id])

            if torch.cuda.is_available():
                speaker_id = speaker_id.cuda()

            with torch.no_grad():
                mel_outputs, mel_outputs_postnet, gate_outputs, _ = tacotron.inference_noattention(
                    (text_encoded, mel, speaker_id, pitch_contour * 0.4,
                     rhythm))

            with torch.no_grad():
                audio = denoiser(
                    waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]

            sf.write(
                os.path.join(output_path, "{}-{}.wav".format(file_idx,
                                                             spk_id)),
                audio.detach().cpu().numpy().T, hparams.sampling_rate)
Example #24
def main():

    parser = argparse.ArgumentParser(
        description='TensorRT Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    # initialize CUDA state
    torch.cuda.init()

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    encoder = load_engine(args.encoder, TRT_LOGGER)
    decoder_iter = load_engine(args.decoder, TRT_LOGGER)
    postnet = load_engine(args.postnet, TRT_LOGGER)
    waveglow = load_engine(args.waveglow, TRT_LOGGER)

    if args.waveglow_ckpt != "":
        # setup denoiser using WaveGlow PyTorch checkpoint
        waveglow_ckpt = load_and_setup_model('WaveGlow', parser, args.waveglow_ckpt,
                                             True, forward_is_infer=True)
        denoiser = Denoiser(waveglow_ckpt).cuda()
        # after initialization, we don't need WaveGlow PyTorch checkpoint
        # anymore - deleting
        del waveglow_ckpt
        torch.cuda.empty_cache()

    # create TRT contexts for each engine
    encoder_context = encoder.create_execution_context()
    decoder_context = decoder_iter.create_execution_context()
    postnet_context = postnet.create_execution_context()
    waveglow_context = waveglow.create_execution_context()

    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
                                              args.output+'/'+args.log_file),
                            StdOutBackend(Verbosity.VERBOSE)])

    texts = []
    try:
        f = open(args.input, 'r')
        texts = f.readlines()
    except:
        print("Could not read file")
        sys.exit(1)

    measurements = {}

    sequences, sequence_lengths = prepare_input_sequence(texts)
    sequences = sequences.to(torch.int32)
    sequence_lengths = sequence_lengths.to(torch.int32)
    with MeasureTime(measurements, "latency"):
        mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
                                               encoder_context, decoder_context, postnet_context,
                                               sequences, sequence_lengths, measurements, args.fp16)
        audios = infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, args.fp16)

    with encoder_context, decoder_context,  postnet_context, waveglow_context:
        pass

    audios = audios.float()
    if args.waveglow_ckpt != "":
        with MeasureTime(measurements, "denoiser"):
            audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    for i, audio in enumerate(audios):
        audio = audio[:mel_lengths[i]*args.stft_hop_length]
        audio = audio/torch.max(torch.abs(audio))
        audio_path = args.output + "audio_"+str(i)+"_trt.wav"
        write(audio_path, args.sampling_rate, audio.cpu().numpy())


    DLLogger.log(step=0, data={"tacotron2_encoder_latency": measurements['tacotron2_encoder_time']})
    DLLogger.log(step=0, data={"tacotron2_decoder_latency": measurements['tacotron2_decoder_time']})
    DLLogger.log(step=0, data={"tacotron2_postnet_latency": measurements['tacotron2_postnet_time']})
    DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0, data={"latency": measurements['latency']})

    if args.waveglow_ckpt != "":
        DLLogger.log(step=0, data={"denoiser": measurements['denoiser']})
    DLLogger.flush()

    prec = "fp16" if args.fp16 else "fp32"
    latency = measurements['latency']
    throughput = audios.size(1)/latency
    log_data = "1,"+str(sequence_lengths[0].item())+","+prec+","+str(latency)+","+str(throughput)+","+str(mel_lengths[0].item())+"\n"
    with open("log_bs1_"+prec+".log", 'a') as f:
        f.write(log_data)
Example #25
def measure(output_directory, log_directory, checkpoint_path, warm_start,
            n_gpus, rank, group_name, hparams):
    """Handles all the validation scoring and printing"""
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)

    mellotron = load_model(hparams).cuda().eval()
    mellotron.load_state_dict(torch.load(checkpoint_path)['state_dict'])

    waveglow_path = '/media/arsh/New Volume/Models/speech/waveglow_256channels_v4.pt'
    waveglow = torch.load(waveglow_path)['model'].cuda().eval()
    denoiser = Denoiser(waveglow).cuda().eval()

    arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
    audio_paths = 'filelists/libritts_train_clean_100_audiopath_text_sid_atleast5min_val_filelist.txt'
    dataloader = TextMelLoader(audio_paths, hparams)
    datacollate = TextMelCollate(1)

    speaker_ids = TextMelLoader(
        "filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt",
        hparams).speaker_ids
    speakers = pd.read_csv('filelists/libritts_speakerinfo.txt',
                           engine='python',
                           header=None,
                           comment=';',
                           sep=' *\| *',
                           names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])
    speakers['MELLOTRON_ID'] = speakers['ID'].apply(
        lambda x: speaker_ids[x] if x in speaker_ids else -1)
    female_speakers = cycle(
        speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())
    male_speakers = cycle(
        speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())

    file_idx = 0
    MEL_DTW = []
    TPP_DTW = []
    RAND_DTW = []
    logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0)
    while file_idx < len(dataloader):
        audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]

        # get audio path, encoded text, pitch contour and mel for gst
        text_encoded = torch.LongTensor(
            text_to_sequence(text, hparams.text_cleaners,
                             arpabet_dict))[None, :].cuda()
        pitch_contour = dataloader[file_idx][3][None].cuda()
        mel = load_mel(audio_path, stft)
        fs, audio = read(audio_path)

        # load source data to obtain rhythm using tacotron 2 as a forced aligner
        x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]]))

        with torch.no_grad():
            # get rhythm (alignment map) using tacotron 2
            mel_outputs, mel_outputs_postnet, gate_outputs, rhythm, gst, tpse_gst = mellotron.forward(
                x)
            rhythm = rhythm.permute(1, 0, 2)
        speaker_id = next(female_speakers) if np.random.randint(2) else next(
            male_speakers)
        speaker_id = torch.LongTensor([speaker_id]).cuda()

        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
                (text_encoded, mel, speaker_id, pitch_contour, rhythm),
                with_tpse=False)
        with torch.no_grad():
            audio_mel = denoiser(
                waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]

        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
                (text_encoded, mel, speaker_id, pitch_contour, rhythm),
                with_tpse=True)
        with torch.no_grad():
            audio_tpp = denoiser(
                waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]

        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
                (text_encoded, np.random.randint(
                    0, 9), speaker_id, pitch_contour, rhythm),
                with_tpse=False)
        with torch.no_grad():
            audio_rand = denoiser(
                waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]
        audio = np.pad(audio, 128)

        MEL_DTW.append(
            logSpecDbConst *
            np.log(dtw(audio_mel.data.cpu().numpy(), audio, eucCepDist)[0]))
        TPP_DTW.append(
            logSpecDbConst *
            np.log(dtw(audio_tpp.data.cpu().numpy(), audio, eucCepDist)[0]))
        RAND_DTW.append(
            logSpecDbConst *
            np.log(dtw(audio_rand.data.cpu().numpy(), audio, eucCepDist)[0]))
        print(MEL_DTW[-1], TPP_DTW[-1], RAND_DTW[-1])
        print("MEL DTW, Mean: ", np.mean(MEL_DTW), " SD: ", np.std(MEL_DTW))
        print("TPP DTW, Mean: ", np.mean(TPP_DTW), " SD: ", np.std(TPP_DTW))
        print("RAND DTW, Mean: ", np.mean(RAND_DTW), " SD: ", np.std(RAND_DTW))
        file_idx += 1
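The dtw calls above pass a local cost function, eucCepDist, that is not shown in this example. A minimal Euclidean-distance stand-in (a sketch only; the original helper presumably operates on cepstral frames rather than raw samples) might look like this:

import numpy as np

def eucCepDist(x, y):
    # Euclidean distance between two frames, used as the DTW local cost.
    return np.linalg.norm(np.atleast_1d(x) - np.atleast_1d(y))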
Example #26
0
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_items_per_sec",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("waveglow_items_per_sec",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("waveglow_latency",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    log_hardware()
    log_args(args)

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.amp_run)
    denoiser = Denoiser(waveglow).cuda()

    # Rebind forward to infer so that torch.jit.script compiles the inference path.
    tacotron2.forward = tacotron2.infer
    type(tacotron2).forward = type(tacotron2).infer
    jitted_tacotron2 = torch.jit.script(tacotron2)

    texts = []
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except IOError:
        print("Could not read file:", args.input)
        sys.exit(1)

    if args.include_warmup:
        sequence = torch.randint(low=0,
                                 high=148,
                                 size=(1, 50),
                                 dtype=torch.long).cuda()
        input_lengths = torch.IntTensor([sequence.size(1)]).cuda().long()
        for i in range(3):
            with torch.no_grad():
                mel, mel_lengths = jitted_tacotron2(sequence, input_lengths)
                _ = waveglow.infer(mel)

    LOGGER.iteration_start()

    measurements = {}

    sequences_padded, input_lengths = prepare_input_sequence(texts)

    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        mel, mel_lengths = jitted_tacotron2(sequences_padded, input_lengths)

    with torch.no_grad(), MeasureTime(measurements, "waveglow_time"):
        audios = waveglow.infer(mel, sigma=args.sigma_infer)
        audios = audios.float()
        audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    tacotron2_infer_perf = mel.size(0) * mel.size(
        2) / measurements['tacotron2_time']
    waveglow_infer_perf = audios.size(0) * audios.size(
        1) / measurements['waveglow_time']

    LOGGER.log(key="tacotron2_items_per_sec", value=tacotron2_infer_perf)
    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="waveglow_items_per_sec", value=waveglow_infer_perf)
    LOGGER.log(key="waveglow_latency", value=measurements['waveglow_time'])
    LOGGER.log(key="latency",
               value=(measurements['tacotron2_time'] +
                      measurements['waveglow_time']))

    for i, audio in enumerate(audios):
        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        audio_path = args.output + "audio_" + str(i) + ".wav"
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    LOGGER.iteration_stop()
    LOGGER.finish()
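MeasureTime above is used as a context manager that accumulates wall-clock durations into the measurements dict. A rough sketch of such a timer (an assumption for illustration, not necessarily the project's own implementation) is:

import time
from contextlib import contextmanager

import torch

@contextmanager
def measure_time(measurements, key, cpu_run=False):
    # Store elapsed wall-clock seconds under `key`. CUDA launches are
    # asynchronous, so synchronize around the timed region when on GPU.
    if not cpu_run:
        torch.cuda.synchronize()
    start = time.perf_counter()
    yield
    if not cpu_run:
        torch.cuda.synchronize()
    measurements[key] = time.perf_counter() - start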
Example #27
0
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU or CPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    log_file = os.path.join(args.output, args.log_file)
    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_file),
                            StdOutBackend(Verbosity.VERBOSE)])
    for k,v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k:v})
    DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})

    # forward_is_infer=True makes the model's forward() dispatch to its infer() method.
    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.fp16, args.cpu, forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.fp16, args.cpu, forward_is_infer=True)
    denoiser = Denoiser(waveglow)
    if not args.cpu:
        denoiser.cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    texts = []
    id_list = []
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except IOError:
        print("Could not read file:", args.input)
        sys.exit(1)

    #-------------------------------------------------------------------------------------------------------------------
    ref_mel = load_mel(args.ref_mel)
    id_list.append(args.emotion_id)
    emotion_id = torch.LongTensor(id_list).cuda()
    print(emotion_id)
    #-------------------------------------------------------------------------------------------------------------------


    if args.include_warmup:
        sequence = torch.randint(low=0, high=80, size=(1,50)).long()
        input_lengths = torch.IntTensor([sequence.size(1)]).long()
        if not args.cpu:
            sequence = sequence.cuda()
            input_lengths = input_lengths.cuda()
        for i in range(3):
            with torch.no_grad():
                mel, mel_lengths, _ = jitted_tacotron2(sequence, input_lengths, ref_mel, emotion_id)
                _ = waveglow(mel)

    measurements = {}

    sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu)

    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time", args.cpu):
        mel, mel_lengths, alignments = jitted_tacotron2(sequences_padded, input_lengths, ref_mel, emotion_id)

    with torch.no_grad(), MeasureTime(measurements, "waveglow_time", args.cpu):
        audios = waveglow(mel, sigma=args.sigma_infer)
        audios = audios.float()
    with torch.no_grad(), MeasureTime(measurements, "denoiser_time", args.cpu):
        audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    print("Stopping after",mel.size(2),"decoder steps")
    tacotron2_infer_perf = mel.size(0)*mel.size(2)/measurements['tacotron2_time']
    waveglow_infer_perf = audios.size(0)*audios.size(1)/measurements['waveglow_time']

    DLLogger.log(step=0, data={"tacotron2_items_per_sec": tacotron2_infer_perf})
    DLLogger.log(step=0, data={"tacotron2_latency": measurements['tacotron2_time']})
    DLLogger.log(step=0, data={"waveglow_items_per_sec": waveglow_infer_perf})
    DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0, data={"denoiser_latency": measurements['denoiser_time']})
    DLLogger.log(step=0, data={"latency": (measurements['tacotron2_time']+measurements['waveglow_time']+measurements['denoiser_time'])})

    for i, audio in enumerate(audios):

        plt.figure()
        plt.imshow(alignments[i].float().data.cpu().numpy().T, aspect="auto", origin="lower")
        figure_path = os.path.join(args.output, "alignment_" + str(i) + args.suffix + ".png")
        plt.savefig(figure_path)
        plt.close()

        audio = audio[:mel_lengths[i]*args.stft_hop_length]
        audio = audio/torch.max(torch.abs(audio))
        audio_path = os.path.join(args.output,"audio_"+str(i)+args.suffix+".wav")
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.flush()
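load_mel is called with slightly different signatures in Example #25 and here; a sketch of the two-argument form, assuming the TacotronSTFT front end used elsewhere in these examples (the project's actual helper may normalize or clamp differently), could be:

import torch
from scipy.io.wavfile import read

def load_mel_sketch(path, stft, max_wav_value=32768.0):
    # Read a wav, scale it to [-1, 1] and compute its mel spectrogram on the GPU.
    sampling_rate, audio = read(path)
    audio = torch.from_numpy(audio).float() / max_wav_value
    melspec = stft.mel_spectrogram(audio.unsqueeze(0))
    return melspec.cuda()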
Example #28
0
from data_utils import TextMelLoader, TextMelCollate
from text import cmudict, text_to_sequence
from mellotron_utils import get_data_from_musicxml

hparams = create_hparams()
hparams.batch_size = 1
stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                    hparams.win_length, hparams.n_mel_channels,
                    hparams.sampling_rate, hparams.mel_fmin, hparams.mel_fmax)
speaker = "nes"
checkpoint_path = '/mnt/sdd1/backup_149/checkpoints/supervised/checkpoint_180000'
model = initiate_model(hparams).cuda().eval()
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
waveglow_path = '/home/admin/projects/mellotron_init_with_single/models/waveglow_256channels_v4.pt'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()
denoiser = Denoiser(waveglow).cuda().eval()
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
test_text_path = 'filelists/emotion/neutral2.txt'
test_set = TextMelLoader(test_text_path, hparams)
datacollate = TextMelCollate(1)
dataloader = DataLoader(test_set,
                        num_workers=1,
                        shuffle=False,
                        batch_size=1,
                        pin_memory=False,
                        drop_last=False,
                        collate_fn=datacollate)
speaker_ids = TextMelLoader(hparams.training_files, hparams).speaker_ids
speaker_id = torch.LongTensor([speaker_ids[speaker]]).cuda()

pytorch_total_params = sum(p.numel() for p in model.parameters())
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    # LOGGER.set_model_name("Tacotron2_PyT")
    # LOGGER.set_backends([
    #     dllg.StdOutBackend(log_file=None,
    #                        logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
    #     dllg.JsonBackend(log_file=args.log_file,
    #                      logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
    # ])
    # LOGGER.register_metric("tacotron2_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
    # LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
    # LOGGER.register_metric("waveglow_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
    # LOGGER.register_metric("waveglow_latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
    # LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    # log_hardware()
    # log_args(args)

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run)

    waveglow = torch.load(args.waveglow)['model']
    # waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
    #                                 args.amp_run)
    denoiser = Denoiser(waveglow).cuda()

    data_loader = torch.utils.data.DataLoader(dataloader(args), batch_size=5,
                                              shuffle=False, collate_fn=collate_fn)

    measurements = {}
    img_num = 0
    k = 0
    for i, data in enumerate(data_loader):
        try: 
            new_num = math.ceil((i+1)/2)             
            sequences_padded, input_lengths, keys = data
            if torch.cuda.is_available():
                sequences_padded = sequences_padded.cuda().long()
                input_lengths = input_lengths.cuda().long()
            else:
                sequences_padded = sequences_padded.long()
                input_lengths = input_lengths.long()

            with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
                _, mel, _, _, mel_lengths = tacotron2.infer(sequences_padded, input_lengths)

            with torch.no_grad(), MeasureTime(measurements, "waveglow_time"):
                audios = waveglow.infer(mel, sigma=args.sigma_infer)
                audios = audios.float()
                audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)




            # tacotron2_infer_perf = mel.size(0)*mel.size(2)/measurements['tacotron2_time']
            # waveglow_infer_perf = audios.size(0)*audios.size(1)/measurements['waveglow_time']

            # LOGGER.log(key="tacotron2_items_per_sec", value=tacotron2_infer_perf)
            # LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
            # LOGGER.log(key="waveglow_items_per_sec", value=waveglow_infer_perf)
            # LOGGER.log(key="waveglow_latency", value=measurements['waveglow_time'])
            # LOGGER.log(key="latency", value=(measurements['tacotron2_time']+
            #                                  measurements['waveglow_time']))

            for j, audio in enumerate(audios):
                k+=1
                key = keys[j]
                audio = audio[:mel_lengths[j]*args.stft_hop_length]
                audio = audio/torch.max(torch.abs(audio))
                # audio_path = args.output + "/audio_"+str(j)+'-'+str(i)+".wav"
                audio_dir = args.output
                audio_path = str(key) + '.wav'         
                save_path = os.path.join(audio_dir,audio_path)
                write(save_path, args.sampling_rate, audio.cpu().numpy())

                print('saved the %i-th audio' % k)
        except Exception as e:
            print("Skipping batch %i due to error: %s" % (i, e))
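The DataLoader above relies on a dataset class (dataloader) and a collate_fn that are not shown. A rough sketch of what such a batch producer might look like, assuming a "key|text" file-list format (all names and details here are assumptions for illustration only), is:

import torch
from text import text_to_sequence

class TextKeyDataset(torch.utils.data.Dataset):
    # Hypothetical dataset: yields (encoded text, key) pairs from a "key|text" file.
    def __init__(self, filelist_path):
        with open(filelist_path, encoding='utf-8') as f:
            self.items = [line.strip().split('|', 1) for line in f if line.strip()]

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        key, text = self.items[idx]
        return torch.LongTensor(text_to_sequence(text, ['english_cleaners'])), key

def collate_fn(batch):
    # Sort by length, pad to the longest sequence, and keep the keys so each
    # generated wav can be named after its key, as in the loop above.
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    lengths = torch.LongTensor([len(seq) for seq, _ in batch])
    padded = torch.zeros(len(batch), int(lengths.max()), dtype=torch.long)
    for i, (seq, _) in enumerate(batch):
        padded[i, :len(seq)] = seq
    keys = [key for _, key in batch]
    return padded, lengths, keys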