Example 1
def main():
    """
    Launches text-to-speech inference on a single GPU or CPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, unknown_args = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    # Per-iteration timing/throughput samples; warm-up iterations are
    # excluded when these lists are filled at the end of each iteration.
    measurements_all = {
        "pre_processing": [],
        "tacotron2_latency": [],
        "waveglow_latency": [],
        "denoiser_latency": [],
        "latency": [],
        "type_conversion": [],
        "data_transfer": [],
        "storage": [],
        "tacotron2_items_per_sec": [],
        "waveglow_items_per_sec": [],
        "num_mels_per_audio": [],
        "throughput": []
    }

    print("args:", args, unknown_args)

    tacotron2 = load_and_setup_model('Tacotron2',
                                     parser,
                                     args.tacotron2,
                                     args.fp16,
                                     args.cpu,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow',
                                    parser,
                                    args.waveglow,
                                    args.fp16,
                                    args.cpu,
                                    forward_is_infer=True)
    denoiser = Denoiser(waveglow)
    if not args.cpu:
        denoiser.cuda()

    texts = [
        "The forms of printed letters should be beautiful, and that their "
        "arrangement on the page should be reasonable and a help to the "
        "shapeliness of the letters themselves. The forms of printed letters "
        "should be beautiful, and that their arrangement on the page should "
        "be reasonable and a help to the shapeliness of the letters "
        "themselves."
    ]
    texts = [texts[0][:args.input_length]]
    texts = texts * args.batch_size

    # Initial iterations whose measurements are discarded as warm-up.
    warmup_iters = 3

    for it in range(args.num_iters):

        measurements = {}

        with MeasureTime(measurements, "pre_processing", args.cpu):
            sequences_padded, input_lengths = prepare_input_sequence(
                texts, args.cpu)

        with torch.no_grad():
            with MeasureTime(measurements, "latency", args.cpu):
                with MeasureTime(measurements, "tacotron2_latency", args.cpu):
                    mel, mel_lengths, _ = tacotron2.infer(
                        sequences_padded, input_lengths)

                with MeasureTime(measurements, "waveglow_latency", args.cpu):
                    audios = waveglow.infer(mel, sigma=args.sigma_infer)

                num_mels = mel.size(0) * mel.size(2)
                num_samples = audios.size(0) * audios.size(1)

                with MeasureTime(measurements, "type_conversion", args.cpu):
                    audios = audios.float()

                # Already inside the outer torch.no_grad() block.
                with MeasureTime(measurements, "denoiser_latency", args.cpu):
                    audios = denoiser(
                        audios, strength=args.denoising_strength).squeeze(1)

        with MeasureTime(measurements, "data_transfer", args.cpu):
            audios = audios.cpu()

        with MeasureTime(measurements, "storage", args.cpu):
            audios = audios.numpy()
            for i, audio in enumerate(audios):
                audio_path = "audio_" + str(i) + ".wav"
                write(audio_path, args.sampling_rate,
                      audio[:mel_lengths[i] * args.stft_hop_length])

        measurements['tacotron2_items_per_sec'] = (
            num_mels / measurements['tacotron2_latency'])
        measurements['waveglow_items_per_sec'] = (
            num_samples / measurements['waveglow_latency'])
        measurements['num_mels_per_audio'] = mel.size(2)
        measurements['throughput'] = num_samples / measurements['latency']

        if it >= warmup_iters:
            for k, v in measurements.items():
                measurements_all[k].append(v)
                DLLogger.log(step=(it - warmup_iters), data={k: v})

    DLLogger.flush()

    print_stats(measurements_all)
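
Note: MeasureTime is imported from the repository's helper modules and is not shown in these examples. A minimal sketch consistent with how it is used above (a context manager that stores the elapsed wall-clock time of a block under a key, synchronizing CUDA when not running on CPU so asynchronous kernels are fully counted) might look like the following; the class body is an assumption, not the repository's actual code.

import time
import torch

class MeasureTime:
    """Hypothetical sketch: time a block and store the result in a dict."""

    def __init__(self, measurements, key, cpu_run=False):
        self.measurements = measurements
        self.key = key
        self.cpu_run = cpu_run

    def __enter__(self):
        if not self.cpu_run:
            # Drain pending GPU work so the timer starts from a clean slate.
            torch.cuda.synchronize()
        self.t0 = time.perf_counter()

    def __exit__(self, exc_type, exc_value, exc_tb):
        if not self.cpu_run:
            torch.cuda.synchronize()
        self.measurements[self.key] = time.perf_counter() - self.t0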
Example 2
def main():
    """
    Launches text-to-speech inference on a single GPU or CPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.output + '/' +
                          args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    tacotron2 = load_and_setup_model('Tacotron2',
                                     parser,
                                     args.tacotron2,
                                     args.fp16,
                                     args.cpu,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow',
                                    parser,
                                    args.waveglow,
                                    args.fp16,
                                    args.cpu,
                                    forward_is_infer=True)
    denoiser = Denoiser(waveglow)
    if not args.cpu:
        denoiser.cuda()

    # Compile Tacotron 2 with TorchScript for inference.
    jitted_tacotron2 = torch.jit.script(tacotron2)

    texts = []
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except IOError:
        print("Could not read file", args.input)
        sys.exit(1)

    if args.include_warmup:
        sequence = torch.randint(low=0, high=148, size=(1, 50)).long()
        input_lengths = torch.IntTensor([sequence.size(1)]).long()
        if not args.cpu:
            sequence = sequence.cuda()
            input_lengths = input_lengths.cuda()
        for i in range(3):
            with torch.no_grad():
                mel, mel_lengths, _ = jitted_tacotron2(sequence, input_lengths)
                _ = waveglow(mel)

    measurements = {}

    sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu)

    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time",
                                      args.cpu):
        mel, mel_lengths, alignments = jitted_tacotron2(
            sequences_padded, input_lengths)

    with torch.no_grad(), MeasureTime(measurements, "waveglow_time", args.cpu):
        audios = waveglow(mel, sigma=args.sigma_infer)
        audios = audios.float()
    with torch.no_grad(), MeasureTime(measurements, "denoiser_time", args.cpu):
        audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    print("Stopping after", mel.size(2), "decoder steps")
    tacotron2_infer_perf = (mel.size(0) * mel.size(2) /
                            measurements['tacotron2_time'])
    waveglow_infer_perf = (audios.size(0) * audios.size(1) /
                           measurements['waveglow_time'])

    DLLogger.log(step=0,
                 data={"tacotron2_items_per_sec": tacotron2_infer_perf})
    DLLogger.log(step=0,
                 data={"tacotron2_latency": measurements['tacotron2_time']})
    DLLogger.log(step=0, data={"waveglow_items_per_sec": waveglow_infer_perf})
    DLLogger.log(step=0,
                 data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0,
                 data={"denoiser_latency": measurements['denoiser_time']})
    DLLogger.log(step=0,
                 data={
                     "latency": (measurements['tacotron2_time'] +
                                 measurements['waveglow_time'] +
                                 measurements['denoiser_time'])
                 })

    for i, audio in enumerate(audios):

        plt.imshow(alignments[i].float().data.cpu().numpy().T,
                   aspect="auto",
                   origin="lower")
        figure_path = os.path.join(
            args.output, "alignment_" + str(i) + "_" + args.suffix + ".png")
        plt.savefig(figure_path)

        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        audio_path = os.path.join(
            args.output, "audio_" + str(i) + "_" + args.suffix + ".wav")
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.flush()
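
prepare_input_sequence is another helper assumed by these examples; Example 3's variant additionally takes a list of text cleaners. A sketch matching its call sites, which encodes each text to symbol IDs, sorts by length (packed RNN encoders typically require longest-first, so this reorders the batch), pads to a common length, and moves the batch to GPU unless cpu_run is set, could be the following; text_to_sequence and its import path are assumptions.

import torch
from tacotron2.text import text_to_sequence  # assumed repo helper

def prepare_input_sequence(texts, cpu_run=False, text_cleaners=None):
    """Hypothetical sketch: encode, sort, pad, and batch input texts."""
    cleaners = text_cleaners or ['english_cleaners']
    seqs = [torch.IntTensor(text_to_sequence(t, cleaners)) for t in texts]
    # Longest first; note that this reorders the batch.
    seqs.sort(key=lambda s: s.size(0), reverse=True)
    lengths = torch.LongTensor([s.size(0) for s in seqs])
    # Right-pad every sequence to the length of the longest one.
    padded = torch.zeros(len(seqs), int(lengths.max()), dtype=torch.long)
    for i, s in enumerate(seqs):
        padded[i, :s.size(0)] = s
    if not cpu_run:
        padded, lengths = padded.cuda(), lengths.cuda()
    return padded, lengths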
Example 3
def main():
    """
    Launches text-to-speech inference on a single GPU or CPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()
    use_custom_naming = args.custom_name
    input_path = args.input
    text_cleaners = args.text_cleaners

    check_directory_and_create(args.output, exists_warning=True)

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.output + '/' +
                          args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    if args.use_extracted_mels:
        print(f"mel found in {args.mel_path}")
        mel = torch.load(args.mel_path)
        mel = mel.unsqueeze(0)
        print(f"The size of the mel we just loaded is {mel.shape}")
        audios = apply_griffin_lim(args, mel)
    else:
        tacotron2 = load_and_setup_model('Tacotron2',
                                         parser,
                                         args.tacotron2,
                                         args.fp16,
                                         args.cpu,
                                         forward_is_infer=True)

        if not args.use_griffin_lim:
            waveglow = load_and_setup_model('WaveGlow',
                                            parser,
                                            args.waveglow,
                                            args.fp16,
                                            args.cpu,
                                            forward_is_infer=True)
            denoiser = Denoiser(waveglow)
            if not args.cpu:
                denoiser.cuda()

        jitted_tacotron2 = torch.jit.script(tacotron2)

        texts = []
        try:
            with open(args.input, 'r') as f:
                texts = f.readlines()
        except IOError:
            print("Could not read file", args.input)
            sys.exit(1)

        if args.include_warmup and (not args.use_griffin_lim):
            sequence = torch.randint(low=0, high=148, size=(1, 50)).long()
            input_lengths = torch.IntTensor([sequence.size(1)]).long()
            if not args.cpu:
                sequence = sequence.cuda()
                input_lengths = input_lengths.cuda()
            for i in range(3):
                with torch.no_grad():
                    mel, mel_lengths, _ = jitted_tacotron2(
                        sequence, input_lengths)
                    _ = waveglow(mel)

        measurements = {}

        sequences_padded, input_lengths = prepare_input_sequence(
            texts, args.cpu, text_cleaners)

        with torch.no_grad(), MeasureTime(measurements, "tacotron2_time",
                                          args.cpu):
            mel, mel_lengths, alignments = jitted_tacotron2(
                sequences_padded, input_lengths)

        if args.use_griffin_lim:
            print(f"The size of the generated mel spec is {mel.shape}")
            audios = apply_griffin_lim(args, mel)
        else:
            with torch.no_grad(), MeasureTime(measurements, "waveglow_time",
                                              args.cpu):
                audios = waveglow(mel, sigma=args.sigma_infer)
                audios = audios.float()
            with torch.no_grad(), MeasureTime(measurements, "denoiser_time",
                                              args.cpu):
                audios = denoiser(audios,
                                  strength=args.denoising_strength).squeeze(1)

            print("Stopping after", mel.size(2), "decoder steps")

            tacotron2_infer_perf = (mel.size(0) * mel.size(2) /
                                    measurements['tacotron2_time'])
            waveglow_infer_perf = (audios.size(0) * audios.size(1) /
                                   measurements['waveglow_time'])

            DLLogger.log(
                step=0, data={"tacotron2_items_per_sec": tacotron2_infer_perf})
            DLLogger.log(
                step=0,
                data={"tacotron2_latency": measurements['tacotron2_time']})
            DLLogger.log(step=0,
                         data={"waveglow_items_per_sec": waveglow_infer_perf})
            DLLogger.log(
                step=0,
                data={"waveglow_latency": measurements['waveglow_time']})
            DLLogger.log(
                step=0,
                data={"denoiser_latency": measurements['denoiser_time']})
            DLLogger.log(step=0,
                         data={
                             "latency": (measurements['tacotron2_time'] +
                                         measurements['waveglow_time'] +
                                         measurements['denoiser_time'])
                         })

    for i, audio in enumerate(audios):
        if use_custom_naming:
            if args.use_extracted_mels:
                custom_name = (args.mel_path.split("/")[-1]).split(".")[0]
            else:
                custom_name = (input_path.split("/")[-1]).split(".")[0]
            custom_path = os.path.join(args.output, custom_name)
            if not args.use_extracted_mels:
                # save alignment
                plt.imshow(alignments[i].float().data.cpu().numpy().T,
                           aspect="auto",
                           origin="lower")
                figure_path = custom_path + "_alignment.png"
                plt.savefig(figure_path)
                meltitle = "_predicted"
            else:
                meltitle = "_extracted"
            # save the mel spectrogram (predicted or extracted)
            plot_mel_spectrogram(
                mel,
                title=meltitle,
                dirname=custom_path,
                append_name=True,
                load_mel_path=False,
            )
            # save generated audio
            if not args.use_extracted_mels:
                audio = audio[:mel_lengths[i] * args.stft_hop_length]
            audio = audio / torch.max(torch.abs(audio))
            audio_path = custom_path + ".wav"
            write(audio_path, args.sampling_rate, audio.cpu().numpy())
        else:
            plt.imshow(alignments[i].float().data.cpu().numpy().T,
                       aspect="auto",
                       origin="lower")
            figure_path = os.path.join(
                args.output, "alignment_" + str(i) + "_" + args.suffix + ".png")
            plt.savefig(figure_path)
            audio = audio[:mel_lengths[i] * args.stft_hop_length]
            audio = audio / torch.max(torch.abs(audio))
            audio_path = os.path.join(
                args.output, "audio_" + str(i) + "_" + args.suffix + ".wav")
            write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.flush()
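
apply_griffin_lim is a project-specific helper that is not shown. One plausible sketch, assuming mel is a log-compressed (batch, n_mels, frames) tensor and that args carries the training-time STFT settings (filter_length and win_length are assumed attribute names, alongside the sampling_rate and stft_hop_length flags used above), leans on librosa's mel inversion, which runs Griffin-Lim phase reconstruction internally:

import librosa
import numpy as np
import torch

def apply_griffin_lim(args, mel, n_iter=60):
    """Hypothetical sketch: vocode a batch of mel spectrograms with Griffin-Lim."""
    audios = []
    for m in mel:
        # Undo the log compression applied when the mel was extracted.
        mag = np.exp(m.float().cpu().numpy())
        audio = librosa.feature.inverse.mel_to_audio(
            mag,
            sr=args.sampling_rate,
            n_fft=args.filter_length,         # assumed attribute name
            hop_length=args.stft_hop_length,
            win_length=args.win_length,       # assumed attribute name
            power=1.0,                        # treat mel as magnitude, not power
            n_iter=n_iter)
        audios.append(torch.from_numpy(audio))
    # All clips share the padded frame count, so they stack cleanly.
    return torch.stack(audios)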