def infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, fp16):
    mel_size = mel.size(2)
    batch_size = mel.size(0)
    stride = 256  # hop length: each mel frame corresponds to 256 audio samples
    n_group = 8   # WaveGlow folds the time axis into groups of 8 samples
    z_size = mel_size * stride
    z_size = z_size // n_group
    z = torch.randn(batch_size, n_group, z_size).cuda()
    audios = torch.zeros(batch_size, mel_size * stride).cuda()

    if fp16:
        z = z.half()
        mel = mel.half()
        audios = audios.half()

    waveglow_tensors = {
        "inputs": {'mel': mel, 'z': z},
        "outputs": {'audio': audios}
    }

    print("Running WaveGlow")
    with MeasureTime(measurements, "waveglow_time"):
        run_trt_engine(waveglow_context, waveglow, waveglow_tensors)

    return audios
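# The run_trt_engine helper used above is defined elsewhere in the repo. Below
# is a minimal sketch of what such a helper could look like, assuming
# TensorRT's named-binding Python API: each named PyTorch tensor is mapped to
# its engine binding slot by device pointer, and inference runs synchronously,
# writing outputs in place into the pre-allocated output tensors (e.g.
# `audios` above). The shipped implementation may differ in detail.
def run_trt_engine(context, engine, tensors):
    bindings = [None] * engine.num_bindings
    for name, tensor in tensors['inputs'].items():
        idx = engine.get_binding_index(name)
        bindings[idx] = tensor.data_ptr()
    for name, tensor in tensors['outputs'].items():
        idx = engine.get_binding_index(name)
        bindings[idx] = tensor.data_ptr()
    # execute_v2 launches the engine synchronously on the given bindings
    context.execute_v2(bindings=bindings)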
# Older variant of infer_waveglow_trt: this engine expects 4-D bindings (mel
# and z carry a trailing singleton dimension), a flat tensor dict, and derives
# the noise length from the transposed-convolution output-size formula.
def infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, fp16):
    mel = mel.unsqueeze(3)
    mel_size = mel.size(2)
    batch_size = mel.size(0)
    stride = 256
    kernel_size = 1024
    n_group = 8
    # ConvTranspose1d output length, trimmed of the padding overhang
    z_size = (mel_size - 1) * stride + (kernel_size - 1) + 1
    z_size = z_size - (kernel_size - stride)
    z_size = z_size // n_group
    z = torch.randn(batch_size, n_group, z_size, 1).cuda()
    audios = torch.zeros(batch_size, mel_size * stride).cuda()

    if fp16:
        z = z.half()
        mel = mel.half()
        audios = audios.half()

    waveglow_tensors = {
        # inputs
        'mel': mel,
        'z': z,
        # outputs
        'audio': audios
    }

    print("Running WaveGlow")
    with MeasureTime(measurements, "waveglow_time"):
        run_trt_engine(waveglow_context, waveglow, waveglow_tensors)

    return audios
def infer_waveglow_onnx(waveglow_path, mel, measurements, fp16):
    import onnx
    import onnxruntime
    sess = onnxruntime.InferenceSession(waveglow_path)

    device = mel.device
    mel_size = mel.size(2)
    batch_size = mel.size(0)
    stride = 256
    n_group = 8
    z_size = mel_size * stride
    z_size = z_size // n_group
    z = torch.randn(batch_size, n_group, z_size).cuda()

    mel = mel.unsqueeze(3)
    z = z.unsqueeze(3)

    if fp16:
        z = z.half()
        mel = mel.half()

    # ONNX Runtime consumes host-side numpy arrays
    mel = mel.cpu().numpy().copy()
    z = z.cpu().numpy().copy()

    print("Running WaveGlow with ONNX Runtime")
    with MeasureTime(measurements, "waveglow_time"):
        result = sess.run(["audio"], {'mel': mel, 'z': z})
        audios = torch.tensor(result[0], device=device)

    return audios
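# Note on the session above: onnxruntime selects execution providers
# automatically. To request GPU execution explicitly (an option, assuming the
# onnxruntime-gpu build is installed), the session could instead be created as:
#
#   sess = onnxruntime.InferenceSession(waveglow_path,
#                                       providers=['CUDAExecutionProvider'])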
def main():
    """
    Launches inference benchmark.
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch FastPitch Inference Benchmark')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    log_file = args.log_file
    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'FastPitch_PyT'})

    model = load_and_setup_model('FastPitch', parser, None, args.amp_run,
                                 'cuda', unk_args=[], forward_is_infer=True,
                                 ema=False, jitable=True)
    # FIXME Temporarily disabled due to nn.LayerNorm fp16 casting bug
    # in pytorch:20.02-py3 and 20.03
    # model = torch.jit.script(model)

    warmup_iters = 3
    iters = 1
    gen_measures = MeasureTime()
    all_frames = 0

    # negative indices are warmup iterations and are excluded from the stats
    for i in range(-warmup_iters, iters):
        text_padded = torch.randint(low=0, high=148,
                                    size=(args.batch_size, 128),
                                    dtype=torch.long).to('cuda')
        input_lengths = torch.IntTensor([text_padded.size(1)] * args.batch_size).to('cuda')
        durs = torch.ones_like(text_padded).mul_(4).to('cuda')

        with torch.no_grad(), gen_measures:
            mels, *_ = model(text_padded, input_lengths, dur_tgt=durs)
        num_frames = mels.size(0) * mels.size(2)

        if i >= 0:
            all_frames += num_frames
            DLLogger.log(step=(i,), data={"latency": gen_measures[-1]})
            DLLogger.log(step=(i,), data={"frames/s": num_frames / gen_measures[-1]})

    measures = gen_measures[warmup_iters:]
    DLLogger.log(step=(), data={'avg latency': np.mean(measures)})
    DLLogger.log(step=(), data={'avg frames/s': all_frames / np.sum(measures)})
    DLLogger.flush()
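# In the FastPitch benchmark above, MeasureTime is used both as a context
# manager and as an indexable history (gen_measures[-1], slicing). A minimal
# sketch of such a timer, assuming CUDA synchronization around each timed
# region; the actual utility shipped with the repo may differ in detail:
import time

class MeasureTimeList(list):
    def __enter__(self):
        torch.cuda.synchronize()
        self.t0 = time.perf_counter()

    def __exit__(self, exc_type, exc_value, exc_traceback):
        torch.cuda.synchronize()
        # append elapsed seconds; the latest measurement is self[-1]
        self.append(time.perf_counter() - self.t0)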
def main():
    parser = argparse.ArgumentParser(
        description='TensorRT Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    # initialize CUDA state
    torch.cuda.init()

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

    encoder = load_engine(args.encoder, TRT_LOGGER)
    decoder_iter = load_engine(args.decoder, TRT_LOGGER)
    postnet = load_engine(args.postnet, TRT_LOGGER)
    waveglow = load_engine(args.waveglow, TRT_LOGGER)

    if args.waveglow_ckpt != "":
        # setup denoiser using WaveGlow PyTorch checkpoint
        waveglow_ckpt = load_and_setup_model('WaveGlow', parser,
                                             args.waveglow_ckpt, True,
                                             forward_is_infer=True)
        denoiser = Denoiser(waveglow_ckpt).cuda()
        # after initialization, we don't need the WaveGlow PyTorch
        # checkpoint anymore - delete it
        del waveglow_ckpt
        torch.cuda.empty_cache()

    # create TRT contexts for each engine
    encoder_context = encoder.create_execution_context()
    decoder_context = decoder_iter.create_execution_context()
    postnet_context = postnet.create_execution_context()
    waveglow_context = waveglow.create_execution_context()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.output + '/' + args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])

    texts = []
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except IOError:
        print("Could not read file")
        sys.exit(1)

    measurements = {}

    sequences, sequence_lengths = prepare_input_sequence(texts)
    sequences = sequences.to(torch.int32)
    sequence_lengths = sequence_lengths.to(torch.int32)
    with MeasureTime(measurements, "latency"):
        mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
                                               encoder_context, decoder_context,
                                               postnet_context,
                                               sequences, sequence_lengths,
                                               measurements, args.fp16)
        audios = infer_waveglow_trt(waveglow, waveglow_context, mel,
                                    measurements, args.fp16)

    with encoder_context, decoder_context, postnet_context, waveglow_context:
        pass

    audios = audios.float()
    if args.waveglow_ckpt != "":
        with MeasureTime(measurements, "denoiser"):
            audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    for i, audio in enumerate(audios):
        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        audio_path = args.output + "audio_" + str(i) + "_trt.wav"
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.log(step=0, data={"tacotron2_encoder_latency": measurements['tacotron2_encoder_time']})
    DLLogger.log(step=0, data={"tacotron2_decoder_latency": measurements['tacotron2_decoder_time']})
    DLLogger.log(step=0, data={"tacotron2_postnet_latency": measurements['tacotron2_postnet_time']})
    DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0, data={"latency": measurements['latency']})

    if args.waveglow_ckpt != "":
        DLLogger.log(step=0, data={"denoiser": measurements['denoiser']})
    DLLogger.flush()

    prec = "fp16" if args.fp16 else "fp32"
    latency = measurements['latency']
    throughput = audios.size(1) / latency
    log_data = f"1,{sequence_lengths[0].item()},{prec},{latency},{throughput},{mel_lengths[0].item()}\n"
    with open("log_bs1_" + prec + ".log", 'a') as f:
        f.write(log_data)
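# load_engine is referenced above but defined elsewhere in the repo. A minimal
# sketch, assuming a serialized TensorRT engine file on disk; the shipped
# helper may differ in detail:
def load_engine(engine_filepath, trt_logger):
    with open(engine_filepath, "rb") as f, trt.Runtime(trt_logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    return engine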
def infer_tacotron2_trt(encoder, decoder_iter, postnet,
                        encoder_context, decoder_context, postnet_context,
                        sequences, sequence_lengths, measurements, fp16):

    memory = torch.zeros((len(sequence_lengths), sequence_lengths[0], 512)).cuda()
    if fp16:
        memory = memory.half()
    device = memory.device
    dtype = memory.dtype
    processed_memory = torch.zeros((len(sequence_lengths), sequence_lengths[0], 128),
                                   device=device, dtype=dtype)
    lens = torch.zeros_like(sequence_lengths)

    encoder_tensors = {
        # inputs
        'sequences': sequences,
        'sequence_lengths': sequence_lengths,
        # outputs
        'memory': memory,
        'lens': lens,
        'processed_memory': processed_memory
    }

    print("Running Tacotron2 Encoder")
    with MeasureTime(measurements, "tacotron2_encoder_time"):
        run_trt_engine(encoder_context, encoder, encoder_tensors)

    device = memory.device
    mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32, device=device)
    not_finished = torch.ones([memory.size(0)], dtype=torch.int32, device=device)
    mel_outputs, gate_outputs, alignments = (torch.zeros(1, device=device),
                                             torch.zeros(1, device=device),
                                             torch.zeros(1, device=device))
    gate_threshold = 0.5  # stop-token probability above which a sample is done
    max_decoder_steps = 1664
    first_iter = True

    decoder_inputs = init_decoder_inputs(memory, processed_memory, sequence_lengths)
    decoder_outputs = init_decoder_outputs(memory, sequence_lengths)

    print("Running Tacotron2 Decoder")
    while True:
        decoder_tensors = init_decoder_tensors(decoder_inputs, decoder_outputs)
        with MeasureTime(measurements, "step"):
            run_trt_engine(decoder_context, decoder_iter, decoder_tensors)

        # decoder_outputs[7] is the predicted mel frame, [8] the gate logit,
        # [4] the attention weights (alignments)
        if first_iter:
            mel_outputs = torch.unsqueeze(decoder_outputs[7], 2)
            gate_outputs = torch.unsqueeze(decoder_outputs[8], 2)
            alignments = torch.unsqueeze(decoder_outputs[4], 2)
            measurements['tacotron2_decoder_time'] = measurements['step']
            first_iter = False
        else:
            mel_outputs = torch.cat((mel_outputs, torch.unsqueeze(decoder_outputs[7], 2)), 2)
            gate_outputs = torch.cat((gate_outputs, torch.unsqueeze(decoder_outputs[8], 2)), 2)
            alignments = torch.cat((alignments, torch.unsqueeze(decoder_outputs[4], 2)), 2)
            measurements['tacotron2_decoder_time'] += measurements['step']

        dec = torch.le(torch.sigmoid(decoder_outputs[8]),
                       gate_threshold).to(torch.int32).squeeze(1)
        not_finished = not_finished * dec
        mel_lengths += not_finished

        if torch.sum(not_finished) == 0:
            print("Stopping after", mel_outputs.size(2), "decoder steps")
            break
        if mel_outputs.size(2) == max_decoder_steps:
            print("Warning! Reached max decoder steps")
            break

        decoder_inputs, decoder_outputs = swap_inputs_outputs(decoder_inputs, decoder_outputs)

    mel_outputs_postnet = torch.zeros_like(mel_outputs, device=device, dtype=dtype)

    postnet_tensors = {
        # inputs
        'mel_outputs': mel_outputs,
        # outputs
        'mel_outputs_postnet': mel_outputs_postnet
    }
    print("Running Tacotron2 Postnet")
    with MeasureTime(measurements, "tacotron2_postnet_time"):
        run_trt_engine(postnet_context, postnet, postnet_tensors)
    print("Tacotron2 Postnet done")

    return mel_outputs_postnet, mel_lengths
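# init_decoder_inputs above builds the initial state tuple for the
# step-by-step decoder engine. A hedged sketch of its likely shape, using the
# standard Tacotron 2 dimensions (attention/decoder RNN size 1024, encoder
# embedding 512, 80 mel channels); the tuple order matches the input names
# used by the looped decoder variant later in this section, but the mask
# polarity and exact details are assumptions:
def init_decoder_inputs_sketch(memory, processed_memory, memory_lengths):
    device = memory.device
    dtype = memory.dtype
    bs = memory.size(0)
    seq_len = memory.size(1)

    attention_hidden = torch.zeros(bs, 1024, device=device, dtype=dtype)
    attention_cell = torch.zeros(bs, 1024, device=device, dtype=dtype)
    decoder_hidden = torch.zeros(bs, 1024, device=device, dtype=dtype)
    decoder_cell = torch.zeros(bs, 1024, device=device, dtype=dtype)
    attention_weights = torch.zeros(bs, seq_len, device=device, dtype=dtype)
    attention_weights_cum = torch.zeros(bs, seq_len, device=device, dtype=dtype)
    attention_context = torch.zeros(bs, 512, device=device, dtype=dtype)
    decoder_input = torch.zeros(bs, 80, device=device, dtype=dtype)  # previous mel frame

    # mask over padded encoder positions (assumed polarity)
    ids = torch.arange(seq_len, device=device)
    mask = ids.unsqueeze(0) >= memory_lengths.unsqueeze(1).to(device)

    return (decoder_input, attention_hidden, attention_cell, decoder_hidden,
            decoder_cell, attention_weights, attention_weights_cum,
            attention_context, memory, processed_memory, mask)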
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU or CPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, unknown_args = parser.parse_known_args()

    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
                            StdOutBackend(Verbosity.VERBOSE)])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    measurements_all = {"pre_processing": [],
                        "tacotron2_latency": [],
                        "waveglow_latency": [],
                        "latency": [],
                        "type_conversion": [],
                        "data_transfer": [],
                        "storage": [],
                        "tacotron2_items_per_sec": [],
                        "waveglow_items_per_sec": [],
                        "num_mels_per_audio": [],
                        "throughput": []}

    print("args:", args, unknown_args)

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run, args.cpu_run,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.amp_run, args.cpu_run)

    if args.cpu_run:
        denoiser = Denoiser(waveglow, args.cpu_run)
    else:
        denoiser = Denoiser(waveglow, args.cpu_run).cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."]
    texts = [texts[0][:args.input_length]]
    texts = texts * args.batch_size

    warmup_iters = 3

    for iter in range(args.num_iters):

        measurements = {}

        with MeasureTime(measurements, "pre_processing", args.cpu_run):
            sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu_run)

        with torch.no_grad():
            with MeasureTime(measurements, "latency", args.cpu_run):
                with MeasureTime(measurements, "tacotron2_latency", args.cpu_run):
                    mel, mel_lengths, _ = jitted_tacotron2(sequences_padded, input_lengths)

                with MeasureTime(measurements, "waveglow_latency", args.cpu_run):
                    audios = waveglow.infer(mel, sigma=args.sigma_infer)
                    audios = audios.float()
                    audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

        num_mels = mel.size(0) * mel.size(2)
        num_samples = audios.size(0) * audios.size(1)

        with MeasureTime(measurements, "type_conversion", args.cpu_run):
            audios = audios.float()

        with MeasureTime(measurements, "data_transfer", args.cpu_run):
            audios = audios.cpu()

        with MeasureTime(measurements, "storage", args.cpu_run):
            audios = audios.numpy()
            for i, audio in enumerate(audios):
                audio_path = "audio_" + str(i) + ".wav"
                write(audio_path, args.sampling_rate,
                      audio[:mel_lengths[i] * args.stft_hop_length])

        measurements['tacotron2_items_per_sec'] = num_mels / measurements['tacotron2_latency']
        measurements['waveglow_items_per_sec'] = num_samples / measurements['waveglow_latency']
        measurements['num_mels_per_audio'] = mel.size(2)
        measurements['throughput'] = num_samples / measurements['latency']

        if iter >= warmup_iters:
            for k, v in measurements.items():
                measurements_all[k].append(v)
                DLLogger.log(step=(iter - warmup_iters), data={k: v})

    DLLogger.flush()
    print_stats(measurements_all)
def main():
    """
    Launches inference benchmark.
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    log_file = args.log_file
    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.output + '/' + args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    model = load_and_setup_model(args.model_name, parser, None, args.amp_run,
                                 forward_is_infer=True)

    if args.model_name == "Tacotron2":
        model = torch.jit.script(model)

    warmup_iters = 3
    num_iters = 1 + warmup_iters

    for i in range(num_iters):
        measurements = {}

        if args.model_name == 'Tacotron2':
            text_padded = torch.randint(low=0, high=148,
                                        size=(args.batch_size, 140),
                                        dtype=torch.long).cuda()
            input_lengths = torch.IntTensor([text_padded.size(1)] * args.batch_size).cuda().long()
            with torch.no_grad(), MeasureTime(measurements, "inference_time"):
                mels, _, _ = model(text_padded, input_lengths)
            num_items = mels.size(0) * mels.size(2)

        if args.model_name == 'WaveGlow':
            n_mel_channels = model.upsample.in_channels
            num_mels = 895
            mel_padded = torch.zeros(args.batch_size, n_mel_channels,
                                     num_mels).normal_(-5.62, 1.98).cuda()
            if args.amp_run:
                mel_padded = mel_padded.half()

            with torch.no_grad(), MeasureTime(measurements, "inference_time"):
                audios = model(mel_padded)
                audios = audios.float()
            num_items = audios.size(0) * audios.size(1)

        if i >= warmup_iters:
            DLLogger.log(step=(i - warmup_iters,),
                         data={"latency": measurements['inference_time']})
            DLLogger.log(step=(i - warmup_iters,),
                         data={"items_per_sec": num_items / measurements['inference_time']})

    DLLogger.log(step=tuple(),
                 data={'infer_latency': measurements['inference_time']})
    DLLogger.log(step=tuple(),
                 data={'infer_items_per_sec': num_items / measurements['inference_time']})

    DLLogger.flush()
# Variant of infer_tacotron2_trt with an optional single-call decoder loop:
# when `loop` is set, the entire autoregressive decoding runs inside one
# engine invocation (TensorRT, or ONNX Runtime when decoder_context is None)
# instead of one engine call per decoder step.
def infer_tacotron2_trt(encoder, decoder_iter, postnet,
                        encoder_context, decoder_context, postnet_context,
                        sequences, sequence_lengths, measurements, fp16, loop):

    batch_size = len(sequence_lengths)
    max_sequence_len = sequence_lengths[0]
    memory = torch.zeros((batch_size, max_sequence_len, 512)).cuda()
    if fp16:
        memory = memory.half()
    device = memory.device
    dtype = memory.dtype
    processed_memory = torch.zeros((batch_size, max_sequence_len, 128),
                                   device=device, dtype=dtype)
    lens = torch.zeros_like(sequence_lengths)
    print(f"batch_size: {batch_size}, max sequence length: {max_sequence_len}")

    encoder_tensors = {
        "inputs": {
            'sequences': sequences,
            'sequence_lengths': sequence_lengths
        },
        "outputs": {
            'memory': memory,
            'lens': lens,
            'processed_memory': processed_memory
        }
    }

    print("Running Tacotron2 Encoder")
    with MeasureTime(measurements, "tacotron2_encoder_time"):
        run_trt_engine(encoder_context, encoder, encoder_tensors)

    max_decoder_steps = 1024
    device = memory.device
    mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32, device=device)
    not_finished = torch.ones([memory.size(0)], dtype=torch.int32, device=device)
    mel_outputs = torch.ones((batch_size, 80, max_decoder_steps),
                             device=device, dtype=dtype).cuda()
    gate_threshold = 0.5
    first_iter = True

    decoder_inputs = init_decoder_inputs(memory, processed_memory, sequence_lengths)
    decoder_outputs = init_decoder_outputs(memory, sequence_lengths)

    if loop:
        if decoder_context is None:
            print("Running Tacotron2 Decoder with loop with ONNX-RT")
            decoder_inputs_onnxrt = [x.cpu().numpy().copy() for x in decoder_inputs]
            import onnx
            import onnxruntime
            sess = onnxruntime.InferenceSession(decoder_iter)

            with MeasureTime(measurements, "tacotron2_decoder_time"):
                result = sess.run(["mel_outputs", "mel_lengths_t"], {
                    'decoder_input_0': decoder_inputs_onnxrt[0],
                    'attention_hidden_0': decoder_inputs_onnxrt[1],
                    'attention_cell_0': decoder_inputs_onnxrt[2],
                    'decoder_hidden_0': decoder_inputs_onnxrt[3],
                    'decoder_cell_0': decoder_inputs_onnxrt[4],
                    'attention_weights_0': decoder_inputs_onnxrt[5],
                    'attention_weights_cum_0': decoder_inputs_onnxrt[6],
                    'attention_context_0': decoder_inputs_onnxrt[7],
                    'memory': decoder_inputs_onnxrt[8],
                    'processed_memory': decoder_inputs_onnxrt[9],
                    'mask': decoder_inputs_onnxrt[10]
                })
            mel_outputs = torch.tensor(result[0], device=device)
            mel_lengths = torch.tensor(result[1], device=device)
        else:
            print("Running Tacotron2 Decoder with loop")
            decoder_tensors = {
                "inputs": {
                    'decoder_input_0': decoder_inputs[0],
                    'attention_hidden_0': decoder_inputs[1],
                    'attention_cell_0': decoder_inputs[2],
                    'decoder_hidden_0': decoder_inputs[3],
                    'decoder_cell_0': decoder_inputs[4],
                    'attention_weights_0': decoder_inputs[5],
                    'attention_weights_cum_0': decoder_inputs[6],
                    'attention_context_0': decoder_inputs[7],
                    'memory': decoder_inputs[8],
                    'processed_memory': decoder_inputs[9],
                    'mask': decoder_inputs[10]
                },
                "outputs": {
                    'mel_outputs': mel_outputs,
                    'mel_lengths_t': mel_lengths
                }
            }
            with MeasureTime(measurements, "tacotron2_decoder_time"):
                run_trt_engine(decoder_context, decoder_iter, decoder_tensors)
            # trim the pre-allocated buffer to the longest produced mel
            mel_outputs = mel_outputs[:, :, :torch.max(mel_lengths)]
    else:
        print("Running Tacotron2 Decoder")
        measurements_decoder = {}
        while True:
            decoder_tensors = init_decoder_tensors(decoder_inputs, decoder_outputs)
            with MeasureTime(measurements_decoder, "step"):
                run_trt_engine(decoder_context, decoder_iter, decoder_tensors)

            # decoder_outputs[7] is the predicted mel frame, [8] the gate
            # logit, [4] the attention weights (alignments)
            if first_iter:
                mel_outputs = torch.unsqueeze(decoder_outputs[7], 2)
                gate_outputs = torch.unsqueeze(decoder_outputs[8], 2)
                alignments = torch.unsqueeze(decoder_outputs[4], 2)
                measurements['tacotron2_decoder_time'] = measurements_decoder['step']
                first_iter = False
            else:
                mel_outputs = torch.cat((mel_outputs, torch.unsqueeze(decoder_outputs[7], 2)), 2)
                gate_outputs = torch.cat((gate_outputs, torch.unsqueeze(decoder_outputs[8], 2)), 2)
                alignments = torch.cat((alignments, torch.unsqueeze(decoder_outputs[4], 2)), 2)
                measurements['tacotron2_decoder_time'] += measurements_decoder['step']

            dec = torch.le(torch.sigmoid(decoder_outputs[8]),
                           gate_threshold).to(torch.int32).squeeze(1)
            not_finished = not_finished * dec
            mel_lengths += not_finished
            if torch.sum(not_finished) == 0:
                print("Stopping after", mel_outputs.size(2), "decoder steps")
                break
            if mel_outputs.size(2) == max_decoder_steps:
                print("Warning! Reached max decoder steps")
                break

            decoder_inputs, decoder_outputs = swap_inputs_outputs(decoder_inputs, decoder_outputs)

    mel_outputs = mel_outputs.clone().detach()
    mel_outputs_postnet = torch.zeros_like(mel_outputs, device=device, dtype=dtype)

    postnet_tensors = {
        "inputs": {
            'mel_outputs': mel_outputs
        },
        "outputs": {
            'mel_outputs_postnet': mel_outputs_postnet
        }
    }
    print("Running Tacotron2 Postnet")
    with MeasureTime(measurements, "tacotron2_postnet_time"):
        run_trt_engine(postnet_context, postnet, postnet_tensors)
    print("Tacotron2 Postnet done")

    return mel_outputs_postnet, mel_lengths
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, unknown_args = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    measurements_all = {
        "pre_processing": [],
        "tacotron2_encoder_time": [],
        "tacotron2_decoder_time": [],
        "tacotron2_postnet_time": [],
        "tacotron2_latency": [],
        "waveglow_latency": [],
        "latency": [],
        "type_conversion": [],
        "data_transfer": [],
        "storage": [],
        "tacotron2_items_per_sec": [],
        "waveglow_items_per_sec": [],
        "num_mels_per_audio": [],
        "throughput": []
    }

    print("args:", args, unknown_args)

    torch.cuda.init()

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    encoder = load_engine(args.encoder, TRT_LOGGER)
    decoder_iter = load_engine(args.decoder, TRT_LOGGER)
    postnet = load_engine(args.postnet, TRT_LOGGER)
    waveglow = load_engine(args.waveglow, TRT_LOGGER)

    if args.waveglow_ckpt != "":
        # setup denoiser using WaveGlow PyTorch checkpoint
        waveglow_ckpt = load_and_setup_model('WaveGlow', parser,
                                             args.waveglow_ckpt,
                                             fp16_run=args.fp16,
                                             cpu_run=False,
                                             forward_is_infer=True)
        denoiser = Denoiser(waveglow_ckpt).cuda()
        # after initialization, we don't need the WaveGlow PyTorch
        # checkpoint anymore - delete it
        del waveglow_ckpt
        torch.cuda.empty_cache()

    # create TRT contexts for each engine
    encoder_context = encoder.create_execution_context()
    decoder_context = decoder_iter.create_execution_context()
    postnet_context = postnet.create_execution_context()
    waveglow_context = waveglow.create_execution_context()

    texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."]
    texts = [texts[0][:args.input_length]]
    texts = texts * args.batch_size

    warmup_iters = 3

    for iter in range(args.num_iters):

        measurements = {}

        with MeasureTime(measurements, "pre_processing"):
            sequences_padded, input_lengths = prepare_input_sequence(texts)
            sequences_padded = sequences_padded.to(torch.int32)
            input_lengths = input_lengths.to(torch.int32)

        with torch.no_grad():
            with MeasureTime(measurements, "latency"):
                with MeasureTime(measurements, "tacotron2_latency"):
                    mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
                                                           encoder_context, decoder_context,
                                                           postnet_context,
                                                           sequences_padded, input_lengths,
                                                           measurements, args.fp16)

                with MeasureTime(measurements, "waveglow_latency"):
                    audios = infer_waveglow_trt(waveglow, waveglow_context, mel,
                                                measurements, args.fp16)

        num_mels = mel.size(0) * mel.size(2)
        num_samples = audios.size(0) * audios.size(1)

        with MeasureTime(measurements, "type_conversion"):
            audios = audios.float()

        with MeasureTime(measurements, "data_transfer"):
            audios = audios.cpu()

        with MeasureTime(measurements, "storage"):
            audios = audios.numpy()
            for i, audio in enumerate(audios):
                audio_path = "audio_" + str(i) + ".wav"
                write(audio_path, args.sampling_rate,
                      audio[:mel_lengths[i] * args.stft_hop_length])

        measurements['tacotron2_items_per_sec'] = num_mels / measurements['tacotron2_latency']
        measurements['waveglow_items_per_sec'] = num_samples / measurements['waveglow_latency']
        measurements['num_mels_per_audio'] = mel.size(2)
        measurements['throughput'] = num_samples / measurements['latency']

        if iter >= warmup_iters:
            for k, v in measurements.items():
                if k in measurements_all.keys():
                    measurements_all[k].append(v)
                    DLLogger.log(step=(iter - warmup_iters), data={k: v})

    DLLogger.flush()
    print_stats(measurements_all)
def main():
    """
    Launches inference benchmark.
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    log_file = args.log_file

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])
    LOGGER.register_metric("items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    log_hardware()
    log_args(args)

    model = load_and_setup_model(args.model_name, parser, None, args.amp_run)

    warmup_iters = 3
    num_iters = 1 + warmup_iters

    for i in range(num_iters):
        if i >= warmup_iters:
            LOGGER.iteration_start()

        measurements = {}

        if args.model_name == 'Tacotron2':
            text_padded = torch.randint(low=0, high=148,
                                        size=(args.batch_size, 140),
                                        dtype=torch.long).cuda()
            input_lengths = torch.IntTensor([text_padded.size(1)] * args.batch_size).cuda().long()
            with torch.no_grad(), MeasureTime(measurements, "inference_time"):
                mels, _ = model.infer(text_padded, input_lengths)
            num_items = mels.size(0) * mels.size(2)

        if args.model_name == 'WaveGlow':
            n_mel_channels = model.upsample.in_channels
            num_mels = 895
            mel_padded = torch.zeros(args.batch_size, n_mel_channels,
                                     num_mels).normal_(-5.62, 1.98).cuda()
            if args.amp_run:
                mel_padded = mel_padded.half()

            with torch.no_grad(), MeasureTime(measurements, "inference_time"):
                audios = model.infer(mel_padded)
                audios = audios.float()
            num_items = audios.size(0) * audios.size(1)

        if i >= warmup_iters:
            LOGGER.log(key="items_per_sec",
                       value=(num_items / measurements['inference_time']))
            LOGGER.log(key="latency", value=measurements['inference_time'])
            LOGGER.iteration_stop()

    LOGGER.finish()
def main():
    """
    Launches inference benchmark.
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    log_file = os.path.join(args.output, args.log_file)
    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    if args.synth_data:
        model = load_and_setup_model(args.model_name, parser, None, args.fp16,
                                     cpu_run=False, forward_is_infer=True)
    else:
        if not os.path.isfile(args.model):
            print(f"File {args.model} does not exist!")
            sys.exit(1)
        model = load_and_setup_model(args.model_name, parser, args.model,
                                     args.fp16, cpu_run=False,
                                     forward_is_infer=True)

    if args.model_name == "Tacotron2":
        model = torch.jit.script(model)

    warmup_iters = 3
    num_iters = 1 + warmup_iters

    for i in range(num_iters):
        measurements = {}

        if args.model_name == 'Tacotron2':
            text_padded, input_lengths = gen_text(args.synth_data)
            with torch.no_grad(), MeasureTime(measurements, "inference_time"):
                mels, _, _ = model(text_padded, input_lengths)
            num_items = mels.size(0) * mels.size(2)

        if args.model_name == 'WaveGlow':
            n_mel_channels = model.upsample.in_channels
            mel_padded = gen_mel(args.synth_data, n_mel_channels, args.fp16)
            with torch.no_grad(), MeasureTime(measurements, "inference_time"):
                audios = model(mel_padded)
                audios = audios.float()
            num_items = audios.size(0) * audios.size(1)

        if i >= warmup_iters:
            DLLogger.log(step=(i - warmup_iters,),
                         data={"latency": measurements['inference_time']})
            DLLogger.log(step=(i - warmup_iters,),
                         data={"items_per_sec": num_items / measurements['inference_time']})

    DLLogger.log(step=tuple(),
                 data={'infer_latency': measurements['inference_time']})
    DLLogger.log(step=tuple(),
                 data={'infer_items_per_sec': num_items / measurements['inference_time']})

    DLLogger.flush()
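# gen_text and gen_mel are used above but defined elsewhere in the repo.
# Hedged sketches of the synthetic-data path only, reusing the shapes and
# statistics the earlier benchmarks in this section used (vocabulary of 148
# symbols, mels drawn from roughly N(-5.62, 1.98)); batch_size and text_len
# are illustrative parameters, and the real helpers (which can also replay
# recorded input shapes when synthetic data is disabled) may differ:
def gen_text_sketch(batch_size=1, text_len=140):
    text_padded = torch.randint(low=0, high=148, size=(batch_size, text_len),
                                dtype=torch.long).cuda()
    input_lengths = torch.IntTensor([text_len] * batch_size).cuda().long()
    return text_padded, input_lengths

def gen_mel_sketch(n_mel_channels, fp16, batch_size=1, num_mels=895):
    mel_padded = torch.zeros(batch_size, n_mel_channels,
                             num_mels).normal_(-5.62, 1.98).cuda()
    if fp16:
        mel_padded = mel_padded.half()
    return mel_padded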