def main():
    """
    Launches text to speech (inference).

    Inference is executed on a single GPU or CPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, unknown_args = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    measurements_all = {
        "pre_processing": [],
        "tacotron2_latency": [],
        "waveglow_latency": [],
        "denoiser_latency": [],
        "latency": [],
        "type_conversion": [],
        "data_transfer": [],
        "storage": [],
        "tacotron2_items_per_sec": [],
        "waveglow_items_per_sec": [],
        "num_mels_per_audio": [],
        "throughput": []
    }

    print("args:", args, unknown_args)

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.fp16, args.cpu,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.fp16, args.cpu,
                                    forward_is_infer=True)
    denoiser = Denoiser(waveglow)
    if not args.cpu:
        denoiser.cuda()

    texts = [
        "The forms of printed letters should be beautiful, and that their "
        "arrangement on the page should be reasonable and a help to the "
        "shapeliness of the letters themselves. The forms of printed "
        "letters should be beautiful, and that their arrangement on the "
        "page should be reasonable and a help to the shapeliness of the "
        "letters themselves."
    ]
    texts = [texts[0][:args.input_length]]
    texts = texts * args.batch_size

    warmup_iters = 3

    for it in range(args.num_iters):
        measurements = {}

        with MeasureTime(measurements, "pre_processing", args.cpu):
            sequences_padded, input_lengths = prepare_input_sequence(
                texts, args.cpu)

        with torch.no_grad():
            with MeasureTime(measurements, "latency", args.cpu):
                with MeasureTime(measurements, "tacotron2_latency",
                                 args.cpu):
                    mel, mel_lengths, _ = tacotron2.infer(
                        sequences_padded, input_lengths)

                with MeasureTime(measurements, "waveglow_latency", args.cpu):
                    audios = waveglow.infer(mel, sigma=args.sigma_infer)

        num_mels = mel.size(0) * mel.size(2)
        num_samples = audios.size(0) * audios.size(1)

        with MeasureTime(measurements, "type_conversion", args.cpu):
            audios = audios.float()

        with torch.no_grad(), MeasureTime(measurements, "denoiser_latency",
                                          args.cpu):
            audios = denoiser(
                audios, strength=args.denoising_strength).squeeze(1)

        with MeasureTime(measurements, "data_transfer", args.cpu):
            audios = audios.cpu()

        with MeasureTime(measurements, "storage", args.cpu):
            audios = audios.numpy()
            for i, audio in enumerate(audios):
                audio_path = "audio_" + str(i) + ".wav"
                write(audio_path, args.sampling_rate,
                      audio[:mel_lengths[i] * args.stft_hop_length])

        measurements['tacotron2_items_per_sec'] = \
            num_mels / measurements['tacotron2_latency']
        measurements['waveglow_items_per_sec'] = \
            num_samples / measurements['waveglow_latency']
        measurements['num_mels_per_audio'] = mel.size(2)
        measurements['throughput'] = num_samples / measurements['latency']

        if it >= warmup_iters:
            for k, v in measurements.items():
                measurements_all[k].append(v)
                DLLogger.log(step=(it - warmup_iters), data={k: v})

    DLLogger.flush()

    print_stats(measurements_all)
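# `MeasureTime` is defined elsewhere in this repo. A minimal sketch of the
# assumed behavior, inferred from its call sites above: a context manager
# that stores the wall-clock time of the enclosed block under `key` in the
# `measurements` dict, synchronizing CUDA first so pending asynchronous GPU
# kernels are included in the measurement. The real helper may differ in
# detail.
import time

import torch


class MeasureTimeSketch:
    def __init__(self, measurements, key, cpu_run=False):
        self.measurements = measurements
        self.key = key
        self.cpu_run = cpu_run

    def __enter__(self):
        # Without a synchronize, queued GPU work would be timed lazily.
        if not self.cpu_run:
            torch.cuda.synchronize()
        self.t0 = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        if not self.cpu_run:
            torch.cuda.synchronize()
        self.measurements[self.key] = time.perf_counter() - self.t0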
def main():
    """
    Launches text to speech (inference).

    Inference is executed on a single GPU or CPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT,
                          os.path.join(args.output, args.log_file)),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.fp16, args.cpu,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.fp16, args.cpu,
                                    forward_is_infer=True)
    denoiser = Denoiser(waveglow)
    if not args.cpu:
        denoiser.cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except OSError as e:
        print("Could not read file:", e)
        sys.exit(1)

    if args.include_warmup:
        # Run a few dummy batches so CUDA kernels are compiled and cached
        # before timing starts.
        sequence = torch.randint(low=0, high=148, size=(1, 50)).long()
        input_lengths = torch.IntTensor([sequence.size(1)]).long()
        if not args.cpu:
            sequence = sequence.cuda()
            input_lengths = input_lengths.cuda()
        for i in range(3):
            with torch.no_grad():
                mel, mel_lengths, _ = jitted_tacotron2(sequence,
                                                       input_lengths)
                _ = waveglow(mel)

    measurements = {}

    sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu)

    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time",
                                      args.cpu):
        mel, mel_lengths, alignments = jitted_tacotron2(sequences_padded,
                                                        input_lengths)

    with torch.no_grad(), MeasureTime(measurements, "waveglow_time",
                                      args.cpu):
        audios = waveglow(mel, sigma=args.sigma_infer)
        audios = audios.float()

    with torch.no_grad(), MeasureTime(measurements, "denoiser_time",
                                      args.cpu):
        audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    print("Stopping after", mel.size(2), "decoder steps")

    tacotron2_infer_perf = \
        mel.size(0) * mel.size(2) / measurements['tacotron2_time']
    waveglow_infer_perf = \
        audios.size(0) * audios.size(1) / measurements['waveglow_time']

    DLLogger.log(step=0,
                 data={"tacotron2_items_per_sec": tacotron2_infer_perf})
    DLLogger.log(step=0,
                 data={"tacotron2_latency": measurements['tacotron2_time']})
    DLLogger.log(step=0,
                 data={"waveglow_items_per_sec": waveglow_infer_perf})
    DLLogger.log(step=0,
                 data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0,
                 data={"denoiser_latency": measurements['denoiser_time']})
    DLLogger.log(step=0,
                 data={"latency": (measurements['tacotron2_time'] +
                                   measurements['waveglow_time'] +
                                   measurements['denoiser_time'])})

    for i, audio in enumerate(audios):
        plt.imshow(alignments[i].float().data.cpu().numpy().T,
                   aspect="auto", origin="lower")
        figure_path = os.path.join(
            args.output, "alignment_" + str(i) + "_" + args.suffix + ".png")
        plt.savefig(figure_path)

        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        audio_path = os.path.join(
            args.output, "audio_" + str(i) + "_" + args.suffix + ".wav")
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.flush()
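# `prepare_input_sequence` is likewise defined elsewhere in this repo. A
# minimal sketch of the assumed behavior, based on how it is used above:
# encode each line of text into symbol IDs, sort the batch longest-first
# (the Tacotron 2 encoder consumes length-sorted, packed batches), and
# zero-pad into a single LongTensor. `text_to_sequence` and its import
# path are assumptions borrowed from the usual Tacotron 2 text module.
import torch

from tacotron2.text import text_to_sequence  # assumed module path


def prepare_input_sequence_sketch(texts, cpu_run=False):
    seqs = [torch.LongTensor(text_to_sequence(t, ['english_cleaners']))
            for t in texts]
    seqs.sort(key=len, reverse=True)  # longest sequence first
    input_lengths = torch.LongTensor([len(s) for s in seqs])
    # Left-aligned, zero-padded (batch, max_len) matrix.
    padded = torch.zeros(len(seqs), int(input_lengths.max()),
                         dtype=torch.long)
    for i, s in enumerate(seqs):
        padded[i, :len(s)] = s
    if not cpu_run:
        padded, input_lengths = padded.cuda(), input_lengths.cuda()
    return padded, input_lengths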
def main():
    """
    Launches text to speech (inference).

    Inference is executed on a single GPU or CPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    use_custom_naming = args.custom_name
    input_path = args.input
    text_cleaners = args.text_cleaners

    check_directory_and_create(args.output, exists_warning=True)

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT,
                          os.path.join(args.output, args.log_file)),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    if args.use_extracted_mels:
        # Vocode a previously extracted mel spectrogram with Griffin-Lim
        # instead of running Tacotron 2.
        print(f"mel found in {args.mel_path}")
        mel = torch.load(args.mel_path)
        mel = mel.unsqueeze(0)
        print(f"The size of the mel we just loaded is {mel.shape}")
        audios = apply_griffin_lim(args, mel)
    else:
        tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                         args.fp16, args.cpu,
                                         forward_is_infer=True)
        if not args.use_griffin_lim:
            waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                            args.fp16, args.cpu,
                                            forward_is_infer=True)
            denoiser = Denoiser(waveglow)
            if not args.cpu:
                denoiser.cuda()

        jitted_tacotron2 = torch.jit.script(tacotron2)

        try:
            with open(args.input, 'r') as f:
                texts = f.readlines()
        except OSError as e:
            print("Could not read file:", e)
            sys.exit(1)

        if args.include_warmup and not args.use_griffin_lim:
            sequence = torch.randint(low=0, high=148, size=(1, 50)).long()
            input_lengths = torch.IntTensor([sequence.size(1)]).long()
            if not args.cpu:
                sequence = sequence.cuda()
                input_lengths = input_lengths.cuda()
            for i in range(3):
                with torch.no_grad():
                    mel, mel_lengths, _ = jitted_tacotron2(sequence,
                                                           input_lengths)
                    _ = waveglow(mel)

        measurements = {}

        sequences_padded, input_lengths = prepare_input_sequence(
            texts, args.cpu, text_cleaners)

        with torch.no_grad(), MeasureTime(measurements, "tacotron2_time",
                                          args.cpu):
            mel, mel_lengths, alignments = jitted_tacotron2(sequences_padded,
                                                            input_lengths)

        if args.use_griffin_lim:
            print(f"The size of the generated mel spec is {mel.shape}")
            audios = apply_griffin_lim(args, mel)
        else:
            with torch.no_grad(), MeasureTime(measurements, "waveglow_time",
                                              args.cpu):
                audios = waveglow(mel, sigma=args.sigma_infer)
                audios = audios.float()

            with torch.no_grad(), MeasureTime(measurements, "denoiser_time",
                                              args.cpu):
                audios = denoiser(
                    audios, strength=args.denoising_strength).squeeze(1)

            print("Stopping after", mel.size(2), "decoder steps")

            tacotron2_infer_perf = \
                mel.size(0) * mel.size(2) / measurements['tacotron2_time']
            waveglow_infer_perf = \
                audios.size(0) * audios.size(1) / measurements['waveglow_time']

            DLLogger.log(
                step=0,
                data={"tacotron2_items_per_sec": tacotron2_infer_perf})
            DLLogger.log(
                step=0,
                data={"tacotron2_latency": measurements['tacotron2_time']})
            DLLogger.log(
                step=0,
                data={"waveglow_items_per_sec": waveglow_infer_perf})
            DLLogger.log(
                step=0,
                data={"waveglow_latency": measurements['waveglow_time']})
            DLLogger.log(
                step=0,
                data={"denoiser_latency": measurements['denoiser_time']})
            DLLogger.log(
                step=0,
                data={"latency": (measurements['tacotron2_time'] +
                                  measurements['waveglow_time'] +
                                  measurements['denoiser_time'])})

    for i, audio in enumerate(audios):
        if use_custom_naming:
            # Name outputs after the input text file (or the mel file when
            # vocoding extracted mels).
            if args.use_extracted_mels:
                custom_name = (args.mel_path.split("/")[-1]).split(".")[0]
            else:
                custom_name = (input_path.split("/")[-1]).split(".")[0]
            custom_path = os.path.join(args.output, custom_name)

            if not args.use_extracted_mels:
                # save alignment
                plt.imshow(alignments[i].float().data.cpu().numpy().T,
                           aspect="auto", origin="lower")
                figure_path = custom_path + "_alignment.png"
                plt.savefig(figure_path)
                meltitle = "_predicted"
            else:
                meltitle = "_extracted"

            # save predicted (or extracted) mel
            plot_mel_spectrogram(mel,
                                 title=meltitle,
                                 dirname=custom_path,
                                 append_name=True,
                                 load_mel_path=False)

            # save generated audio
            if not args.use_extracted_mels:
                audio = audio[:mel_lengths[i] * args.stft_hop_length]
                audio = audio / torch.max(torch.abs(audio))
                audio_path = custom_path + ".wav"
                write(audio_path, args.sampling_rate, audio.cpu().numpy())
        else:
            # Default naming: index the outputs and append the run suffix.
            # This branch uses Tacotron 2 outputs (alignments, mel_lengths),
            # so it expects a synthesis run rather than extracted mels.
            plt.imshow(alignments[i].float().data.cpu().numpy().T,
                       aspect="auto", origin="lower")
            figure_path = os.path.join(
                args.output,
                "alignment_" + str(i) + "_" + args.suffix + ".png")
            plt.savefig(figure_path)

            audio = audio[:mel_lengths[i] * args.stft_hop_length]
            audio = audio / torch.max(torch.abs(audio))
            audio_path = os.path.join(
                args.output, "audio_" + str(i) + "_" + args.suffix + ".wav")
            write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.flush()
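# `apply_griffin_lim`, `check_directory_and_create`, and
# `plot_mel_spectrogram` are helpers specific to this fork and are defined
# elsewhere. A minimal sketch of a Griffin-Lim fallback vocoder consistent
# with how `apply_griffin_lim(args, mel)` is used above; the STFT settings
# (`args.filter_length`, `args.win_length`) and the natural-log mel
# compression are assumptions based on the standard Tacotron 2
# configuration, and the real helper may differ.
import librosa
import torch


def apply_griffin_lim_sketch(args, mel, n_iters=60):
    # Undo the (assumed) natural-log dynamic-range compression; handles a
    # single-item batch of shape (1, n_mels, T).
    mel = torch.exp(mel.squeeze(0)).float().cpu().numpy()
    # Pseudo-invert the mel filterbank to an approximate magnitude
    # spectrogram, then recover phase iteratively with Griffin-Lim.
    spec = librosa.feature.inverse.mel_to_stft(
        mel, sr=args.sampling_rate, n_fft=args.filter_length, power=1.0)
    audio = librosa.griffinlim(spec, n_iter=n_iters,
                               hop_length=args.stft_hop_length,
                               win_length=args.win_length)
    # Return a (batch, samples) tensor so the saving loop above can iterate
    # over it the same way as the WaveGlow output.
    return torch.from_numpy(audio).unsqueeze(0)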