# Standard-library and third-party imports used by the routines below.
import math
import os
import pickle
import time

import toml
import torch
from tqdm import tqdm

from apex import amp  # assumed source of the `amp.initialize` calls below

# Project-local helpers referenced below (audio_from_file, run_once, jit_export,
# calc_wer, add_ctc_labels, get_vocab, get_onnx, AudioToTextDataLayer,
# AudioPreprocessing, JasperEncoderDecoder, Jasper, Optimization,
# AmpOptimizations, process_evaluation_batch, process_evaluation_epoch and
# __ctc_decoder_predictions_tensor) are assumed importable from the
# surrounding repository.


def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels,
         multi_gpu, device, args):
    """Performs inference / evaluation.

    Args:
        data_layer: data layer object that holds data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        multi_gpu: true if using multiple gpus
        device: torch device to run inference on
        args: script input arguments
    """
    logits_save_to = args.logits_save_to
    with torch.no_grad():
        if args.wav:
            audio, audio_len = audio_from_file(args.wav)
            run_once(audio_processor, encoderdecoder, greedy_decoder,
                     audio, audio_len, labels, device)
            if args.export_model:
                jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder = \
                    jit_export(audio, audio_len, audio_processor,
                               encoderdecoder, greedy_decoder, args)
                run_once(jit_audio_processor, jit_encoderdecoder,
                         jit_greedy_decoder, audio, audio_len, labels, device)
            return

        wer, _global_var_dict = calc_wer(data_layer, audio_processor,
                                         encoderdecoder, greedy_decoder,
                                         labels, args, device)
        if not multi_gpu or (multi_gpu and torch.distributed.get_rank() == 0):
            print("==========>>>>>>Evaluation WER: {0}\n".format(wer))
            if args.save_prediction is not None:
                with open(args.save_prediction, 'w') as fp:
                    fp.write('\n'.join(_global_var_dict['predictions']))
            if logits_save_to is not None:
                logits = []
                for batch in _global_var_dict["logits"]:
                    for i in range(batch.shape[0]):
                        logits.append(batch[i].cpu().numpy())
                with open(logits_save_to, 'wb') as f:
                    pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL)
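
# Hedged sketch of the run_once helper called above (the repository's actual
# helper may differ): it pushes a single utterance through preprocessing, the
# acoustic model, and the greedy decoder, then prints timing and transcript,
# mirroring the single-wav path of the CUDA eval variant further below.
def run_once_sketch(audio_processor, encoderdecoder, greedy_decoder,
                    audio, audio_len, labels, device):
    with torch.no_grad():
        features, _ = audio_processor(audio, audio_len)
        if device.type == "cuda":
            torch.cuda.synchronize()
        t0 = time.perf_counter()
        log_probs = encoderdecoder(features)
        if device.type == "cuda":
            torch.cuda.synchronize()
        t1 = time.perf_counter()
        predictions = greedy_decoder(log_probs=log_probs)
        hypotheses = __ctc_decoder_predictions_tensor(predictions, labels=labels)
        print("INFERENCE TIME\t\t: {} ms".format((t1 - t0) * 1000.0))
        print("TRANSCRIPT\t\t:", hypotheses[0])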
def get_pytorch_components_and_onnx(args):
    '''Returns PyTorch components used for inference.'''
    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    # Set up global labels for future vocab calls
    global _global_ctc_labels
    _global_ctc_labels = add_ctc_labels(dataset_vocab)
    featurizer_config = model_definition['input_eval']
    optim_level = 3 if args.pyt_fp16 else 0
    featurizer_config["optimization_level"] = optim_level

    audio_preprocessor = None
    onnx_path = None
    data_layer = None
    wav = None
    seq_len = None

    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration
    if args.dataset_dir is not None:
        data_layer = AudioToTextDataLayer(dataset_dir=args.dataset_dir,
                                          featurizer_config=featurizer_config,
                                          manifest_filepath=args.val_manifest,
                                          labels=dataset_vocab,
                                          batch_size=args.batch_size,
                                          shuffle=False)
    if args.wav is not None:
        args.batch_size = 1
        wav, seq_len = audio_from_file(args.wav)
        if args.seq_len is None or args.seq_len == 0:
            args.seq_len = seq_len / (featurizer_config['sample_rate'] / 100)

    if args.transpose:
        featurizer_config["transpose_out"] = True
        model_definition["transpose_in"] = True

    model = JasperEncoderDecoder(jasper_model_definition=model_definition,
                                 feat_in=1024,
                                 num_classes=len(get_vocab()),
                                 transpose_in=args.transpose)
    model = model.cuda()
    model.eval()
    audio_preprocessor = AudioPreprocessing(**featurizer_config)
    audio_preprocessor = audio_preprocessor.cuda()
    audio_preprocessor.eval()

    if args.ckpt_path is not None:
        if os.path.isdir(args.ckpt_path):
            d_checkpoint = torch.load(args.ckpt_path + "/decoder.pt",
                                      map_location="cpu")
            e_checkpoint = torch.load(args.ckpt_path + "/encoder.pt",
                                      map_location="cpu")
            model.jasper_encoder.load_state_dict(e_checkpoint, strict=False)
            model.jasper_decoder.load_state_dict(d_checkpoint, strict=False)
        else:
            checkpoint = torch.load(args.ckpt_path, map_location="cpu")
            model.load_state_dict(checkpoint['state_dict'], strict=False)

    # If we are producing an engine rather than running/creating ONNX, postpone
    # AMP initialization (the ONNX parser cannot handle mixed-FP16 ONNX yet).
    if args.pyt_fp16 and args.engine_path is None:
        amp.initialize(models=model, opt_level=AmpOptimizations[optim_level])

    if args.make_onnx:
        if args.onnx_path is None or args.ckpt_path is None:
            raise Exception(
                "--ckpt_path, --onnx_path must be provided when using --make_onnx")
        onnx_path = get_onnx(args.onnx_path, model, args)

    if args.pyt_fp16 and args.engine_path is not None:
        amp.initialize(models=model, opt_level=AmpOptimizations[optim_level])

    return {
        'data_layer': data_layer,
        'audio_preprocessor': audio_preprocessor,
        'acoustic_model': model,
        'input_wav': (wav, seq_len)
    }, onnx_path
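
# Hedged usage sketch (not in the original source): wiring the returned
# component dict into the eval routine above. parse_args and GreedyCTCDecoder
# are hypothetical names used only for illustration.
#
#     args = parse_args()
#     components, onnx_path = get_pytorch_components_and_onnx(args)
#     greedy_decoder = GreedyCTCDecoder()
#     eval(components['data_layer'], components['audio_preprocessor'],
#          components['acoustic_model'], greedy_decoder,
#          labels=get_vocab(), multi_gpu=False,
#          device=torch.device('cuda'), args=args)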
def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels,
         multi_gpu, args):
    """Performs inference / evaluation.

    Args:
        data_layer: data layer object that holds data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        multi_gpu: true if using multiple gpus
        args: script input arguments
    """
    if args.ipex:
        import intel_extension_for_pytorch as ipex
    logits_save_to = args.logits_save_to
    encoderdecoder.eval()
    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
            'logits': [],
        }

        if args.wav:
            # TODO: single-wav inference is not implemented for ipex yet.
            assert False, "wav unsupported in ipex for now"
            features, p_length_e = audio_processor(audio_from_file(args.wav))
            t0 = time.perf_counter()
            t_log_probs_e = encoderdecoder(features)
            t1 = time.perf_counter()
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
            hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e,
                                                          labels=labels)
            print("INFERENCE TIME\t\t: {} ms".format((t1 - t0) * 1000.0))
            print("TRANSCRIPT\t\t:", hypotheses[0])
            return

        steps_per_epoch = len(data_layer)
        total_steps = args.steps if args.steps is not None else steps_per_epoch
        test_epoches = int(total_steps / steps_per_epoch)
        print('Evaluating RNNT: steps per epoch {}, total steps {}'.format(
            steps_per_epoch, total_steps))

        def warm_up():
            """Runs args.warm_up untimed decoding steps."""
            if args.warm_up > 0:
                print("\nstart warm up, warm_up steps =", args.warm_up)
                for it, data in enumerate(tqdm(data_layer.data_iterator)):
                    t_audio_signal_e, t_a_sig_length_e, t_transcript_e, \
                        t_transcript_len_e = audio_processor(data)
                    greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e,
                                          args, None)
                    if it + 1 >= args.warm_up:
                        break

        def measure():
            """Times total_steps decoding steps; returns (total_time, prof)."""
            print("\nstart measure performance, measure steps =", total_steps)
            total_time = 0.0
            prof = None
            with tqdm(total=total_steps) as pbar:
                for epoch in range(test_epoches + 1):
                    for it, data in enumerate(data_layer.data_iterator):
                        if epoch * steps_per_epoch + it >= total_steps:
                            break
                        t_audio_signal_e, t_a_sig_length_e, t_transcript_e, \
                            t_transcript_len_e = audio_processor(data)
                        if args.profiling:
                            with torch.profiler.profile(
                                    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')) as prof:
                                t0 = time.perf_counter()
                                t_predictions_e = greedy_decoder.decode(
                                    t_audio_signal_e, t_a_sig_length_e, args, None)
                                t1 = time.perf_counter()
                        else:
                            t0 = time.perf_counter()
                            t_predictions_e = greedy_decoder.decode(
                                t_audio_signal_e, t_a_sig_length_e, args, None)
                            t1 = time.perf_counter()
                        total_time += t1 - t0
                        values_dict = dict(
                            predictions=[t_predictions_e],
                            transcript=[t_transcript_e],
                            transcript_length=[t_transcript_len_e],
                        )
                        process_evaluation_batch(values_dict, _global_var_dict,
                                                 labels=labels)
                        pbar.update(1)
            return total_time, prof

        total_time = 0.0
        prof = None
        if args.ipex and args.int8 and args.calibration:
            # Int8 calibration
            print("running int8 calibration step\n")
            conf = ipex.AmpConf(torch.int8)
            for it, data in enumerate(tqdm(data_layer.data_iterator)):
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, \
                    t_transcript_len_e = audio_processor(data)
                t_predictions_e, conf = greedy_decoder.decode(
                    t_audio_signal_e, t_a_sig_length_e, args, conf)
                if args.steps is not None and it + 1 >= args.steps:
                    break
            conf.save(args.configure_dir)
        # Inference (vanilla cpu, dnnl fp32 or dnnl int8); the same warm-up and
        # measurement loop is shared by all three code paths.
        elif args.ipex and args.mix_precision:
            with torch.cpu.amp.autocast():
                warm_up()
                total_time, prof = measure()
        else:
            warm_up()
            total_time, prof = measure()

        if args.print_result:
            hypotheses = _global_var_dict['predictions']
            references = _global_var_dict['transcripts']
            nb = len(hypotheses)
            print("print %d sample results: " % nb)
            for i in range(nb):
                print("hyp: ", hypotheses[i])
                print("ref: ", references[i])
                print()
        if args.profiling and prof is not None:
            print(prof.key_averages().table(sort_by="self_cpu_time_total"))

        wer, _ = process_evaluation_epoch(_global_var_dict)
        if not multi_gpu or (multi_gpu and torch.distributed.get_rank() == 0):
            print("\n=========================>>>>>>")
            print("Evaluation WER: {0}".format(wer))
            print("Accuracy: {:.15f} ".format(1 - wer))
            if args.save_prediction is not None:
                with open(args.save_prediction, 'w') as fp:
                    fp.write('\n'.join(_global_var_dict['predictions']))
            if logits_save_to is not None:
                logits = []
                for batch in _global_var_dict["logits"]:
                    for i in range(batch.shape[0]):
                        logits.append(batch[i].cpu().numpy())
                with open(logits_save_to, 'wb') as f:
                    pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL)
            if args.steps:
                total_samples = args.steps * args.batch_size
            else:
                total_samples = len(data_layer)
            print("total samples tested: ", total_samples)
            print("total time (encoder + decoder, excluding audio processing): ",
                  total_time, "s")
            print("dataset size: ", len(data_layer))
            if total_time > 0:
                perf = total_samples / total_time
                print("Throughput: {:.3f} fps".format(perf))
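
# Worked example of the step bookkeeping in the RNNT eval above (illustrative
# numbers, not from the source): with steps_per_epoch = 100 batches and
# args.steps = 250, total_steps = 250 and test_epoches = int(250 / 100) = 2,
# so the measurement loop walks epochs 0..2 and stops once
# epoch * steps_per_epoch + it reaches 250; throughput is then
# total_samples / total_time = (250 * batch_size) / total_time.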
def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels,
         multi_gpu, args):
    """Performs inference / evaluation.

    Args:
        data_layer: data layer object that holds data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        multi_gpu: true if using multiple gpus
        args: script input arguments
    """
    logits_save_to = args.logits_save_to
    audio_processor.eval()
    encoderdecoder.eval()
    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
            'logits': [],
        }

        if args.wav:
            features, p_length_e = audio_processor(audio_from_file(args.wav))
            torch.cuda.synchronize()
            t0 = time.perf_counter()
            t_log_probs_e = encoderdecoder(features)
            torch.cuda.synchronize()
            t1 = time.perf_counter()
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
            hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e,
                                                          labels=labels)
            print("INFERENCE TIME\t\t: {} ms".format((t1 - t0) * 1000.0))
            print("TRANSCRIPT\t\t:", hypotheses[0])
            return

        for it, data in enumerate(tqdm(data_layer.data_iterator)):
            tensors = [d.cuda() for d in data]
            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, \
                t_transcript_len_e = tensors
            inp = (t_audio_signal_e, t_a_sig_length_e)
            t_processed_signal, p_length_e = audio_processor(x=inp)
            if args.use_conv_mask:
                t_log_probs_e, t_encoded_len_e = encoderdecoder(
                    (t_processed_signal, p_length_e))
            else:
                t_log_probs_e = encoderdecoder(t_processed_signal)
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
            values_dict = dict(
                predictions=[t_predictions_e],
                transcript=[t_transcript_e],
                transcript_length=[t_transcript_len_e],
                output=[t_log_probs_e],
            )
            process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
            if args.steps is not None and it + 1 >= args.steps:
                break

        wer, _ = process_evaluation_epoch(_global_var_dict)
        if not multi_gpu or (multi_gpu and torch.distributed.get_rank() == 0):
            print("==========>>>>>>Evaluation WER: {0}\n".format(wer))
            if args.save_prediction is not None:
                with open(args.save_prediction, 'w') as fp:
                    fp.write('\n'.join(_global_var_dict['predictions']))
            if logits_save_to is not None:
                logits = []
                for batch in _global_var_dict["logits"]:
                    for i in range(batch.shape[0]):
                        logits.append(batch[i].cpu().numpy())
                with open(logits_save_to, 'wb') as f:
                    pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL)
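
# Hedged sketch of the __ctc_decoder_predictions_tensor helper used above,
# assuming standard CTC greedy collapsing with the blank token as the last
# entry appended by add_ctc_labels; the repository's implementation may differ.
def ctc_decoder_predictions_tensor_sketch(predictions, labels):
    blank_id = len(labels) - 1  # assumption: blank is the last label
    hypotheses = []
    for row in predictions.long().cpu():
        previous = blank_id
        decoded = []
        for p in row.tolist():
            # emit a symbol only when it changes and is not blank
            if p != previous and p != blank_id:
                decoded.append(labels[p])
            previous = p
        hypotheses.append(''.join(decoded))
    return hypotheses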
def get_pytorch_components_and_onnx(args):
    '''Returns PyTorch components used for inference.'''
    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    # Set up global labels for future vocab calls
    global _global_ctc_labels
    _global_ctc_labels = add_ctc_labels(dataset_vocab)
    featurizer_config = model_definition['input_eval']
    optim_level = Optimization.mxprO3 if args.pyt_fp16 else Optimization.mxprO0
    featurizer_config["optimization_level"] = optim_level

    acoustic_model = None
    audio_preprocessor = None
    onnx_path = None
    data_layer = None
    wav = None
    seq_len = None
    dtype = torch.float

    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration
    if args.dataset_dir is not None:
        data_layer = AudioToTextDataLayer(dataset_dir=args.dataset_dir,
                                          featurizer_config=featurizer_config,
                                          manifest_filepath=args.val_manifest,
                                          labels=dataset_vocab,
                                          batch_size=args.batch_size,
                                          shuffle=False)
    if args.wav is not None:
        args.batch_size = 1
        args.engine_batch_size = 1
        wav, seq_len = audio_from_file(args.wav)
        if args.seq_len is None or args.seq_len == 0:
            args.seq_len = seq_len / (featurizer_config['sample_rate'] / 100)

    model = Jasper(feature_config=featurizer_config,
                   jasper_model_definition=model_definition,
                   feat_in=1024,
                   num_classes=len(get_vocab()))
    model.cuda()
    model.eval()
    acoustic_model = model.acoustic_model
    audio_preprocessor = model.audio_preprocessor

    if args.ckpt_path is not None:
        checkpoint = torch.load(args.ckpt_path, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=False)

    if args.make_onnx:
        # The check matches the error message: both paths must be supplied.
        if args.onnx_path is None or args.ckpt_path is None:
            raise Exception(
                "--ckpt_path, --onnx_path must be provided when using --make_onnx")
        onnx_path = get_onnx(args.onnx_path, acoustic_model,
                             signal_shape=(args.engine_batch_size, 64,
                                           args.seq_len),
                             dtype=dtype)

    if args.pyt_fp16:
        amp.initialize(models=acoustic_model,
                       opt_level=AmpOptimizations[optim_level])

    return {
        'data_layer': data_layer,
        'audio_preprocessor': audio_preprocessor,
        'acoustic_model': acoustic_model,
        'input_wav': (wav, seq_len)
    }, onnx_path
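
# Hedged sketch of a get_onnx helper compatible with the call above (the real
# repo helper may differ): exports the acoustic model to ONNX with a dummy
# input of the requested shape. Names and the opset choice are assumptions.
def get_onnx_sketch(onnx_path, acoustic_model, signal_shape, dtype=torch.float):
    # seq_len may arrive as a float; torch.randn needs integer dimensions
    shape = tuple(int(s) for s in signal_shape)
    dummy_input = torch.randn(*shape, dtype=dtype, device='cuda')
    torch.onnx.export(acoustic_model, dummy_input, onnx_path,
                      input_names=['audio_signal'],
                      output_names=['log_probs'],
                      opset_version=11,  # assumption
                      dynamic_axes={'audio_signal': {0: 'batch', 2: 'time'},
                                    'log_probs': {0: 'batch'}})
    return onnx_path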
def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels, args):
    """Performs evaluation and prints performance statistics.

    Args:
        data_layer: data layer object that holds data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        args: script input arguments
    """
    batch_size = args.batch_size
    steps = args.steps
    audio_processor.eval()
    encoderdecoder.eval()
    greedy_decoder.eval()

    # TORCHSCRIPT
    if args.cpu_run:
        audio, audio_len = audio_from_file(args.sample_audio, cpu_run=True)
        jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder = jit_export(
            audio, audio_len, audio_processor, encoderdecoder, greedy_decoder, args)

    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
        }
        it = 0
        ep = 0
        if steps is None:
            steps = math.ceil(len(data_layer) / batch_size)
        durations_dnn = []
        durations_dnn_and_prep = []
        seq_lens = []
        dl_device = torch.device("cpu") if args.cpu_run else torch.device("cuda")

        while True:
            ep += 1
            for data in tqdm(data_layer.data_iterator):
                it += 1
                if it > steps:
                    break
                tensors = [d.to(dl_device) for d in data]
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, \
                    t_transcript_len_e = tensors

                if not args.cpu_run:
                    torch.cuda.synchronize()
                    t0 = time.perf_counter()
                    t_processed_signal = audio_processor(t_audio_signal_e,
                                                         t_a_sig_length_e)
                    torch.cuda.synchronize()
                    t1 = time.perf_counter()
                    t_log_probs_e, _ = encoderdecoder.infer(t_processed_signal)
                    torch.cuda.synchronize()
                    stop_time = time.perf_counter()
                    time_prep_and_dnn = stop_time - t0
                    time_dnn = stop_time - t1
                    t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
                else:
                    t0 = time.perf_counter()
                    t_processed_signal, _ = jit_audio_processor(t_audio_signal_e,
                                                                t_a_sig_length_e)
                    t1 = time.perf_counter()
                    t_log_probs_e, _ = jit_encoderdecoder(t_processed_signal)
                    stop_time = time.perf_counter()
                    time_prep_and_dnn = stop_time - t0
                    time_dnn = stop_time - t1
                    t_predictions_e = jit_greedy_decoder(log_probs=t_log_probs_e)

                values_dict = dict(
                    predictions=[t_predictions_e],
                    transcript=[t_transcript_e],
                    transcript_length=[t_transcript_len_e],
                )
                process_evaluation_batch(values_dict, _global_var_dict,
                                         labels=labels)
                durations_dnn.append(time_dnn)
                durations_dnn_and_prep.append(time_prep_and_dnn)
                seq_lens.append(t_processed_signal[0].shape[-1])

            if it >= steps:
                wer, _ = process_evaluation_epoch(_global_var_dict)
                print("==========>>>>>>Evaluation of all iterations WER: {0}\n"
                      .format(wer))
                break

    ratios = [0.9, 0.95, 0.99, 1.]
    latencies_dnn = take_durations_and_output_percentile(durations_dnn, ratios)
    latencies_dnn_and_prep = take_durations_and_output_percentile(
        durations_dnn_and_prep, ratios)
    print("\nusing batch size {} and {} frames".format(batch_size, seq_lens[-1]))
    print("\n".join("dnn latency {} : {}".format(k, v)
                    for k, v in latencies_dnn.items()))
    print("\n".join("prep + dnn latency {} : {}".format(k, v)
                    for k, v in latencies_dnn_and_prep.items()))
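
# Hedged sketch of the take_durations_and_output_percentile helper used above
# (the repository's implementation may differ): sort the measured durations
# and report, for each ratio, the latency at that percentile of the run.
import numpy as np  # assumption: numpy is available in this environment

def take_durations_and_output_percentile_sketch(durations, ratios):
    durations = np.sort(np.asarray(durations))
    latencies = {}
    for ratio in ratios:
        # clamp the percentile index so ratio = 1.0 maps to the worst case
        idx = min(int(len(durations) * ratio), len(durations) - 1)
        latencies[ratio] = durations[idx]
    return latencies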