def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels,
         multi_gpu, args):
    """Performs inference / evaluation.

    Args:
        data_layer: data layer object that holds the data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        multi_gpu: True if using multiple GPUs
        args: script input arguments
    """
    logits_save_to = args.logits_save_to
    audio_processor.eval()
    encoderdecoder.eval()
    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
            'logits': [],
        }
        for it, data in enumerate(tqdm(data_layer.data_iterator)):
            tensors = [d.cuda() for d in data]
            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

            inp = (t_audio_signal_e, t_a_sig_length_e)
            t_processed_signal, p_length_e = audio_processor(x=inp)
            t_log_probs_e, _ = encoderdecoder((t_processed_signal, p_length_e))
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

            values_dict = dict(
                predictions=[t_predictions_e],
                transcript=[t_transcript_e],
                transcript_length=[t_transcript_len_e],
                output=[t_log_probs_e],
            )
            process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

            if args.steps is not None and it + 1 >= args.steps:
                break

        wer, _ = process_evaluation_epoch(_global_var_dict)
        if not multi_gpu or torch.distributed.get_rank() == 0:
            print("==========>>>>>>Evaluation WER: {0}\n".format(wer))
            if args.save_prediction is not None:
                with open(args.save_prediction, 'w') as fp:
                    fp.write('\n'.join(_global_var_dict['predictions']))
            if logits_save_to is not None:
                logits = []
                for batch in _global_var_dict["logits"]:
                    for i in range(batch.shape[0]):
                        logits.append(batch[i].cpu().numpy())
                with open(logits_save_to, 'wb') as f:
                    pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL)
def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels, args):
    """Performs inference / evaluation.

    Args:
        data_layer: data layer object that holds the data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        args: script input arguments
    """
    logits_save_to = args.logits_save_to
    encoderdecoder.eval()
    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
            'logits': [],
        }
        for it, data in enumerate(tqdm(data_layer.data_iterator)):
            (t_audio_signal_e, t_a_sig_length_e, transcript_list,
             t_transcript_e, t_transcript_len_e) = audio_processor(data)

            # A plain forward pass such as
            #   t_log_probs_e, (_, _) = encoderdecoder(
            #       ((t_audio_signal_e, t_transcript_e),
            #        (t_a_sig_length_e, t_transcript_len_e)))
            # is basically useless here: in RNN-T the encoder outputs mean
            # nothing by themselves, so decoding goes through the greedy
            # decoder instead.
            t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e)

            values_dict = dict(
                predictions=[t_predictions_e],
                transcript=transcript_list,
                transcript_length=t_transcript_len_e,
            )
            process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

            if args.steps is not None and it + 1 >= args.steps:
                break

        wer = process_evaluation_epoch(_global_var_dict)
        print("==========>>>>>>Evaluation WER: {0}\n".format(wer))

        if args.save_prediction is not None:
            with open(args.save_prediction, 'w') as fp:
                fp.write('\n'.join(_global_var_dict['predictions']))
        if logits_save_to is not None:
            # Note: nothing is appended to `logits` on this path, so an empty
            # list is pickled.
            logits = []
            with open(logits_save_to, 'wb') as f:
                pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL)
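# The function above delegates to greedy_decoder.decode because, unlike CTC,
# RNN-T has no per-frame posterior to argmax directly: the joint network must
# be re-evaluated every time a symbol is emitted. Below is a minimal,
# self-contained sketch of that greedy loop; TinyRNNT and greedy_decode are
# illustrative stand-ins, not the model or decoder used by the code above.
import torch
import torch.nn as nn

BLANK = 0  # blank index assumed for this sketch

class TinyRNNT(nn.Module):
    """Toy transcription/prediction/joint networks with random weights."""
    def __init__(self, feat_dim=16, vocab_size=29, hidden=32):
        super().__init__()
        self.encoder = nn.LSTM(feat_dim, hidden, batch_first=True)
        self.embed = nn.Embedding(vocab_size, hidden)
        self.predictor = nn.LSTM(hidden, hidden, batch_first=True)
        self.joint = nn.Linear(2 * hidden, vocab_size)

def greedy_decode(model, feats, max_symbols_per_step=30):
    """Greedy RNN-T decode of one utterance of shape (1, T, feat_dim)."""
    enc_out, _ = model.encoder(feats)  # (1, T, H)
    labels, pred_state = [], None
    # Start the prediction network from an all-zero start-of-sequence input.
    pred_out, pred_state = model.predictor(
        torch.zeros(1, 1, model.embed.embedding_dim), pred_state)
    for t in range(enc_out.size(1)):
        emitted = 0
        while emitted < max_symbols_per_step:
            joint_in = torch.cat([enc_out[:, t], pred_out[:, -1]], dim=-1)
            k = int(model.joint(joint_in).argmax(dim=-1))
            if k == BLANK:
                break  # blank: advance to the next acoustic frame
            labels.append(k)
            emitted += 1
            # Feed the emitted symbol back through the prediction network.
            pred_out, pred_state = model.predictor(
                model.embed(torch.tensor([[k]])), pred_state)
    return labels

if __name__ == "__main__":
    torch.manual_seed(0)
    print(greedy_decode(TinyRNNT(), torch.randn(1, 50, 16)))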
def eval(model, name=''):
    """Evaluates model on the evaluation dataset."""
    with torch.no_grad():
        _global_var_dict = {
            'EvalLoss': [],
            'predictions': [],
            'transcripts': [],
        }
        eval_dataloader = data_layer_eval.data_iterator
        for data in eval_dataloader:
            tensors = [d.cuda() if isinstance(d, torch.Tensor) else d for d in data]
            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

            model.eval()
            if optim_level == 1:
                # Feature extraction runs in fp32 even under AMP.
                with amp.disable_casts():
                    t_processed_signal_e, t_processed_sig_length_e = audio_preprocessor(
                        t_audio_signal_e, t_a_sig_length_e)
            else:
                t_processed_signal_e, t_processed_sig_length_e = audio_preprocessor(
                    t_audio_signal_e, t_a_sig_length_e)

            if jasper_encoder.use_conv_mask:
                t_log_probs_e, t_encoded_len_e = model.forward(
                    (t_processed_signal_e, t_processed_sig_length_e))
            else:
                # Without the conv mask the model returns only log probs;
                # t_encoded_len_e is undefined on this path, so the loss below
                # relies on use_conv_mask being enabled.
                t_log_probs_e = model.forward(t_processed_signal_e)

            t_loss_e = ctc_loss(log_probs=t_log_probs_e, targets=t_transcript_e,
                                input_length=t_encoded_len_e,
                                target_length=t_transcript_len_e)
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

            values_dict = dict(loss=[t_loss_e],
                               predictions=[t_predictions_e],
                               transcript=[t_transcript_e],
                               transcript_length=[t_transcript_len_e])
            process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

        # Final aggregation (across all workers and minibatches) and logging of results.
        wer, eloss = process_evaluation_epoch(_global_var_dict)
        if name != '':
            name = '_' + name
        print_once(f"==========>>>>>>Evaluation{name} Loss: {eloss}\n")
        print_once(f"==========>>>>>>Evaluation{name} WER: {wer}\n")
def main():
    args = get_args()
    labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l",
              "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y",
              "z", "'"]
    manifest = Manifest(args.dataset_dir, [args.manifest], labels, len(labels),
                        normalize=True, max_duration=15.0)
    with open(os.path.join(args.log_dir, "mlperf_log_accuracy.json")) as fh:
        results = json.load(fh)

    hypotheses = []
    references = []
    for result in results:
        hypotheses.append(array.array(dtype_map[args.output_dtype],
                                      bytes.fromhex(result["data"])).tolist())
        references.append(manifest[result["qsl_idx"]]["transcript"])

    references = __gather_predictions([references], labels=labels)
    hypotheses = __gather_predictions([hypotheses], labels=labels)

    d = dict(predictions=hypotheses, transcripts=references)
    wer = process_evaluation_epoch(d)
    print("Word Error Rate: {:}%, accuracy={:}%".format(wer * 100, (1 - wer) * 100))
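# The MLPerf accuracy scripts above reduce to a single metric. A minimal
# sketch of the word error rate that process_evaluation_epoch is assumed to
# report: word-level Levenshtein distance summed over the epoch, divided by
# the total number of reference words. word_error_rate here is illustrative,
# not the helper used by the scripts above.
def word_error_rate(hypotheses, references):
    def edit_distance(hyp, ref):
        # Single-row dynamic-programming Levenshtein distance over word lists.
        row = list(range(len(ref) + 1))
        for i, h in enumerate(hyp, 1):
            prev, row[0] = row[0], i
            for j, r in enumerate(ref, 1):
                prev, row[j] = row[j], min(row[j] + 1,       # deletion
                                           row[j - 1] + 1,   # insertion
                                           prev + (h != r))  # substitution
        return row[-1]

    errors = sum(edit_distance(h.split(), r.split())
                 for h, r in zip(hypotheses, references))
    words = sum(len(r.split()) for r in references)
    return errors / max(words, 1)

# Example: word_error_rate(["the cat sat"], ["the cat sat down"]) == 0.25
# (one deleted word out of four reference words).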
def calc_wer(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels, args):
    # Unwrap DistributedDataParallel if the model is wrapped in it.
    encoderdecoder = encoderdecoder.module if hasattr(encoderdecoder, 'module') else encoderdecoder
    with torch.no_grad():
        # Reset _global_var_dict - results of the evaluation will be stored there.
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
            'logits': [],
        }
        dl_device = torch.device("cpu") if args.cpu_run else torch.device("cuda")
        # Evaluation mini-batch loop.
        for it, data in enumerate(tqdm(data_layer.data_iterator)):
            tensors = [d.to(dl_device) for d in data]
            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

            t_processed_signal = audio_processor(t_audio_signal_e, t_a_sig_length_e)
            t_log_probs_e, _ = encoderdecoder.infer(t_processed_signal)
            t_predictions_e = greedy_decoder(t_log_probs_e)

            values_dict = dict(
                predictions=[t_predictions_e],
                transcript=[t_transcript_e],
                transcript_length=[t_transcript_len_e],
                output=[t_log_probs_e],
            )
            # values_dict will contain results from all workers.
            process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

            if args.steps is not None and it + 1 >= args.steps:
                break

        # Final aggregation (over minibatches) and logging of results.
        wer, _ = process_evaluation_epoch(_global_var_dict)
        return wer, _global_var_dict
def main():
    args = get_args()
    labels = [
        " ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
        "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"
    ]
    qsl = AudioQSL(args.dataset_dir, args.manifest, labels)
    manifest = qsl.manifest
    with open(os.path.join(args.log_dir, "mlperf_log_accuracy.json")) as fh:
        results = json.load(fh)

    hypotheses = []
    references = []
    for result in results:
        hypotheses.append(array.array('q', bytes.fromhex(result["data"])).tolist())
        references.append(manifest[result["qsl_idx"]]["transcript"])

    hypotheses = __gather_predictions([hypotheses], labels=labels)
    references = __gather_predictions([references], labels=labels)

    d = dict(predictions=hypotheses, transcripts=references)
    print("Word Error Rate:", process_evaluation_epoch(d))
def get_results(log_probs, original_tensors, batch_size):
    '''Returns WER and predictions for the outputs of the acoustic model.

    Used for one-off batches. Epoch-wide evaluation should use
    global_process_batch and global_process_epoch.
    '''
    greedy_decoder = GreedyCTCDecoder()
    predicts = norm(greedy_decoder(log_probs=log_probs))
    values_dict = dict(
        predictions=[predicts],
        transcript=[original_tensors[2][0:batch_size, ...]],
        transcript_length=[original_tensors[3][0:batch_size, ...]],
    )
    temp_dict = {
        'predictions': [],
        'transcripts': [],
    }
    process_evaluation_batch(values_dict, temp_dict, labels=get_vocab())
    predictions = temp_dict['predictions']
    wer, _ = process_evaluation_epoch(temp_dict)
    return wer, predictions
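# get_results above wraps a GreedyCTCDecoder. A minimal, self-contained sketch
# of what greedy CTC decoding does: take the argmax symbol per frame, collapse
# consecutive repeats, then drop blanks. The blank index is assumed to be the
# index just past the vocabulary, as with the 28-symbol label lists above;
# greedy_ctc_decode is illustrative, not the decoder class used here.
import torch

def greedy_ctc_decode(log_probs, labels, blank=None):
    """log_probs: (T, B, V) tensor -> list of B decoded strings."""
    blank = len(labels) if blank is None else blank
    best = log_probs.argmax(dim=-1).t()  # (B, T) best symbol per frame
    decoded = []
    for seq in best.tolist():
        prev, chars = blank, []
        for k in seq:
            if k != prev and k != blank:  # collapse repeats, drop blanks
                chars.append(labels[k])
            prev = k
        decoded.append("".join(chars))
    return decoded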
def eval():
    """Evaluates model on the evaluation dataset."""
    with torch.no_grad():
        _global_var_dict = {
            'EvalLoss': [],
            'predictions': [],
            'transcripts': [],
        }
        eval_dataloader = data_layer_eval.data_iterator
        for data in eval_dataloader:
            tensors = [d.cuda() if isinstance(d, torch.Tensor) else d for d in data]
            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

            model.eval()
            t_log_probs_e, t_encoded_len_e = model(x=(t_audio_signal_e, t_a_sig_length_e))
            t_loss_e = ctc_loss(log_probs=t_log_probs_e, targets=t_transcript_e,
                                input_length=t_encoded_len_e,
                                target_length=t_transcript_len_e)
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

            values_dict = dict(
                loss=[t_loss_e],
                predictions=[t_predictions_e],
                transcript=[t_transcript_e],
                transcript_length=[t_transcript_len_e],
            )
            process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

        # Final aggregation (across all workers and minibatches) and logging of results.
        wer, eloss = process_evaluation_epoch(_global_var_dict)
        print_once("==========>>>>>>Evaluation Loss: {0}\n".format(eloss))
        print_once("==========>>>>>>Evaluation WER: {0}\n".format(wer))
def eval(data_layer, audio_processor, greedy_decoder, labels, args):
    """Performs inference / evaluation.

    Args:
        data_layer: data layer object that holds the data loader
        audio_processor: data processing module
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        args: script input arguments
    """
    start_t = time.time()
    if args.mode == 1 or args.mode == 2:
        rnnt_hw_model = RNNT_infer_model()
    else:
        rnnt_hw_model = None
    logits_save_to = args.logits_save_to
    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
            'logits': [],
        }
        processes = []
        for it, data in enumerate(data_layer.data_iterator):
            if args.mode == 3:
                (t_audio_signal_e, t_a_sig_length_e, transcript_list,
                 t_transcript_e, t_transcript_len_e) = audio_processor(data)
                h_rnns = (None, None)
                label = []
                hidden = None
                # Greedy decode on CPU.
                t_transcript_e = torch.nn.utils.rnn.pad_packed_sequence(
                    t_transcript_e, batch_first=True)[0]
                t_predictions_e, h_pre_rnns, hidden_predict, decode_batch_length = \
                    greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e,
                                          h_rnns, label, hidden, None)
                values_dict = dict(
                    predictions=[t_predictions_e],
                    transcript=transcript_list,
                    transcript_length=t_transcript_len_e,
                )
                process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
            else:
                # Hardware-accelerated modes run each batch in a worker process.
                process_ver = MyProcess(data, audio_processor, _global_var_dict,
                                        labels, ver_Process, rnnt_hw_model,
                                        greedy_decoder)
                process_ver.start()
                processes.append(process_ver)
            if args.steps is not None and it + 1 >= args.steps:
                break

        if args.mode != 3:
            # Wait for every worker process to finish.
            for p in processes:
                p.join()

        wer = process_evaluation_epoch(_global_var_dict)
        print("=================>Evaluation WER: {0}\n".format(wer))
        if args.save_prediction is not None:
            with open(args.save_prediction, 'w') as fp:
                fp.write('\n'.join(_global_var_dict['predictions']))

    end_t = time.time()
    if args.mode == 1:
        print('dpu computation time (lstm_run time):', sum(lstm_run_time_t))
    print('e2e decode time:', end_t - start_t)
def global_process_epoch(is_trt=True):
    '''Returns the WER accumulated in the global dictionary.'''
    dict_to_process = _global_trt_dict if is_trt else _global_pyt_dict
    wer, _ = process_evaluation_epoch(dict_to_process)
    return wer
def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels, args):
    """Performs evaluation and prints performance statistics.

    Args:
        data_layer: data layer object that holds the data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        args: script input arguments
    """
    batch_size = args.batch_size
    steps = args.steps
    audio_processor.eval()
    encoderdecoder.eval()
    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
        }
        it = 0
        ep = 0
        if steps is None:
            steps = math.ceil(len(data_layer) / batch_size)
        durations_dnn = []
        durations_dnn_and_prep = []
        seq_lens = []
        # The data iterator may hold fewer than `steps` batches, so loop over
        # epochs until `steps` batches have been timed.
        while True:
            ep += 1
            for data in tqdm(data_layer.data_iterator):
                it += 1
                if it > steps:
                    break
                dl_device = torch.device("cuda")
                tensors = [d.to(dl_device) for d in data]
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

                inp = (t_audio_signal_e, t_a_sig_length_e)
                torch.cuda.synchronize()
                t0 = time.perf_counter()
                t_processed_signal, p_length_e = audio_processor(x=inp)
                torch.cuda.synchronize()
                t1 = time.perf_counter()
                t_log_probs_e, _ = encoderdecoder((t_processed_signal, p_length_e))
                torch.cuda.synchronize()
                stop_time = time.perf_counter()

                time_prep_and_dnn = stop_time - t0
                time_dnn = stop_time - t1
                t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
                values_dict = dict(
                    predictions=[t_predictions_e],
                    transcript=[t_transcript_e],
                    transcript_length=[t_transcript_len_e],
                )
                process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
                durations_dnn.append(time_dnn)
                durations_dnn_and_prep.append(time_prep_and_dnn)
                seq_lens.append(t_processed_signal.shape[-1])

            if it >= steps:
                wer, _ = process_evaluation_epoch(_global_var_dict)
                print("==========>>>>>>Evaluation of all iterations WER: {0}\n".format(wer))
                break

        ratios = [0.9, 0.95, 0.99, 1.]
        latencies_dnn = take_durations_and_output_percentile(durations_dnn, ratios)
        latencies_dnn_and_prep = take_durations_and_output_percentile(
            durations_dnn_and_prep, ratios)
        print("\n using batch size {} and {} frames ".format(batch_size, seq_lens[-1]))
        print("\n".join(["dnn latency {} : {} ".format(k, v)
                         for k, v in latencies_dnn.items()]))
        print("\n".join(["prep + dnn latency {} : {} ".format(k, v)
                         for k, v in latencies_dnn_and_prep.items()]))
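# take_durations_and_output_percentile is not shown in this section. Given how
# its result is printed above, a hedged sketch of what it is assumed to do:
# map each ratio to the latency at that percentile of the sorted durations,
# with ratio 1.0 being the slowest observed batch. This is an assumption, not
# the actual helper.
import numpy as np

def take_durations_and_output_percentile(durations, ratios):
    durations = np.sort(np.asarray(durations))
    return {
        ratio: durations[min(int(len(durations) * ratio), len(durations) - 1)]
        for ratio in ratios
    }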
def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels,
         multi_gpu, args):
    """Performs inference / evaluation.

    Args:
        data_layer: data layer object that holds the data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        multi_gpu: True if using multiple GPUs
        args: script input arguments
    """
    if args.ipex:
        import intel_extension_for_pytorch as ipex
    logits_save_to = args.logits_save_to
    encoderdecoder.eval()
    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
            'logits': [],
        }

        if args.wav:
            # TODO: single-wav transcription is unimplemented with ipex.
            assert False, "wav unsupported in ipex for now"
            features, p_length_e = audio_processor(audio_from_file(args.wav))
            # torch.cuda.synchronize()
            t0 = time.perf_counter()
            t_log_probs_e = encoderdecoder(features)
            # torch.cuda.synchronize()
            t1 = time.perf_counter()
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
            hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=labels)
            print("INFERENCE TIME\t\t: {} ms".format((t1 - t0) * 1000.0))
            print("TRANSCRIPT\t\t:", hypotheses[0])
            return

        steps_per_epoch = len(data_layer)
        total_steps = args.steps if args.steps is not None else steps_per_epoch
        test_epoches = int(total_steps / steps_per_epoch)
        print('Evaluating RNNT: Steps per Epoch {} total Steps {}'.format(
            steps_per_epoch, total_steps))

        total_time = 0.0
        prof = None
        if args.ipex and args.int8 and args.calibration:
            # Int8 calibration.
            print("running int8 calibration step\n")
            conf = ipex.AmpConf(torch.int8)
            for it, data in enumerate(tqdm(data_layer.data_iterator)):
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = \
                    audio_processor(data)
                t_predictions_e, conf = greedy_decoder.decode(
                    t_audio_signal_e, t_a_sig_length_e, args, conf)
                if args.steps is not None and it + 1 >= args.steps:
                    break
            conf.save(args.configure_dir)
        else:
            # Inference (vanilla cpu, dnnl fp32, or dnnl int8). The warm-up and
            # measurement loops are identical for every backend, so they are
            # factored into helpers instead of being repeated per branch.
            def warm_up():
                if args.warm_up > 0:
                    print("\nstart warm up, warm_up steps = ", args.warm_up)
                    for it, data in enumerate(tqdm(data_layer.data_iterator)):
                        t_audio_signal_e, t_a_sig_length_e, t_transcript_e, \
                            t_transcript_len_e = audio_processor(data)
                        greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e,
                                              args, None)
                        if it + 1 >= args.warm_up:
                            break

            def measure():
                print("\nstart measure performance, measure steps = ", total_steps)
                elapsed = 0.0
                prof = None
                with tqdm(total=total_steps) as pbar:
                    for epoch in range(test_epoches + 1):
                        for it, data in enumerate(data_layer.data_iterator):
                            if epoch * steps_per_epoch + it >= total_steps:
                                break
                            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, \
                                t_transcript_len_e = audio_processor(data)
                            if args.profiling:
                                with torch.profiler.profile(
                                        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')) as prof:
                                    t0 = time.perf_counter()
                                    t_predictions_e = greedy_decoder.decode(
                                        t_audio_signal_e, t_a_sig_length_e, args, None)
                                    t1 = time.perf_counter()
                            else:
                                t0 = time.perf_counter()
                                t_predictions_e = greedy_decoder.decode(
                                    t_audio_signal_e, t_a_sig_length_e, args, None)
                                t1 = time.perf_counter()
                            elapsed += (t1 - t0)
                            values_dict = dict(
                                predictions=[t_predictions_e],
                                transcript=[t_transcript_e],
                                transcript_length=[t_transcript_len_e],
                            )
                            process_evaluation_batch(values_dict, _global_var_dict,
                                                     labels=labels)
                            pbar.update(1)
                return elapsed, prof

            if args.ipex and args.mix_precision:
                # bf16 path: run warm-up and measurement under autocast.
                with torch.cpu.amp.autocast():
                    warm_up()
                    total_time, prof = measure()
            else:
                warm_up()
                total_time, prof = measure()

        if args.print_result:
            hypotheses = _global_var_dict['predictions']
            references = _global_var_dict['transcripts']
            nb = len(hypotheses)
            print("print %d sample results: " % nb)
            for i in range(nb):
                print("hyp: ", hypotheses[i])
                print("ref: ", references[i])
                print()

        if args.profiling and prof is not None:
            print(prof.key_averages().table(sort_by="self_cpu_time_total"))

        wer, _ = process_evaluation_epoch(_global_var_dict)
        if not multi_gpu or torch.distributed.get_rank() == 0:
            print("\n=========================>>>>>>")
            print("Evaluation WER: {0}".format(wer))
            print("Accuracy: {:.15f} ".format(1 - wer))
            if args.save_prediction is not None:
                with open(args.save_prediction, 'w') as fp:
                    fp.write('\n'.join(_global_var_dict['predictions']))
            if logits_save_to is not None:
                logits = []
                for batch in _global_var_dict["logits"]:
                    for i in range(batch.shape[0]):
                        logits.append(batch[i].cpu().numpy())
                with open(logits_save_to, 'wb') as f:
                    pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL)

        if args.steps:
            total_samples = args.steps * args.batch_size
        else:
            total_samples = len(data_layer)
        print("total samples tested: ", total_samples)
        print("total time (encoder + decoder, excluded audio processing): ",
              total_time, "s")
        print("dataset size: ", len(data_layer))
        perf = total_samples / total_time
        print("Throughput: {:.3f} fps".format(perf))
def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels,
         multi_gpu, args):
    """Performs inference / evaluation.

    Args:
        data_layer: data layer object that holds the data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        multi_gpu: True if using multiple GPUs
        args: script input arguments
    """
    logits_save_to = args.logits_save_to
    encoderdecoder.eval()
    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
            'logits': [],
        }

        if args.wav:
            features, p_length_e = audio_processor(audio_from_file(args.wav))
            torch.cuda.synchronize()
            t0 = time.perf_counter()
            t_log_probs_e = encoderdecoder(features)
            torch.cuda.synchronize()
            t1 = time.perf_counter()
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
            hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=labels)
            print("INFERENCE TIME\t\t: {} ms".format((t1 - t0) * 1000.0))
            print("TRANSCRIPT\t\t:", hypotheses[0])
            return

        for it, data in enumerate(tqdm(data_layer.data_iterator)):
            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = \
                audio_processor(data)
            t_log_probs_e, (x_len, y_len) = encoderdecoder(
                ((t_audio_signal_e, t_transcript_e),
                 (t_a_sig_length_e, t_transcript_len_e)),
            )
            t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e)
            values_dict = dict(
                predictions=[t_predictions_e],
                transcript=[t_transcript_e],
                transcript_length=[t_transcript_len_e],
                output=[t_log_probs_e],
            )
            process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
            if args.steps is not None and it + 1 >= args.steps:
                break

        wer, _ = process_evaluation_epoch(_global_var_dict)
        if not multi_gpu or torch.distributed.get_rank() == 0:
            print("==========>>>>>>Evaluation WER: {0}\n".format(wer))
            if args.save_prediction is not None:
                with open(args.save_prediction, 'w') as fp:
                    fp.write('\n'.join(_global_var_dict['predictions']))
            if logits_save_to is not None:
                logits = []
                for batch in _global_var_dict["logits"]:
                    for i in range(batch.shape[0]):
                        logits.append(batch[i].cpu().numpy())
                with open(logits_save_to, 'wb') as f:
                    pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL)
def evaluation(epoch=0):
    model.eval()
    if args.ipex:
        if args.bf16:
            print("running bfloat16 evaluation step\n")
        else:
            print("running fp32 evaluation step\n")
    for dataset, frequency, name in eval_datasets:
        if epoch % frequency != 0:
            continue
        print_once(f"Doing {name} ....................... ...... ... .. . .")
        with torch.no_grad():
            _global_var_dict = {
                'EvalLoss': [],
                'predictions': [],
                'transcripts': [],
            }
            dataloader = dataset.data_iterator
            for data in dataloader:
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = \
                    data_transforms(data)
                if args.ipex and args.bf16:
                    with torch.cpu.amp.autocast():
                        t_log_probs_e, (x_len, y_len) = model(
                            ((t_audio_signal_e, t_transcript_e),
                             (t_a_sig_length_e, t_transcript_len_e)),
                        )
                else:
                    # fp32 path, with or without ipex.
                    t_log_probs_e, (x_len, y_len) = model(
                        ((t_audio_signal_e, t_transcript_e),
                         (t_a_sig_length_e, t_transcript_len_e)),
                    )
                t_loss_e = loss_fn((t_log_probs_e, x_len), (t_transcript_e, y_len))
                print(t_loss_e)
                del t_log_probs_e
                t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e)

                values_dict = dict(
                    loss=[t_loss_e],
                    predictions=[t_predictions_e],
                    transcript=[t_transcript_e],
                    transcript_length=[t_transcript_len_e],
                )
                process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

            # Final aggregation (across all workers and minibatches) and logging of results.
            wer, eloss = process_evaluation_epoch(_global_var_dict)
            logger.log_scalar('loss', eloss, epoch, name)
            logger.log_scalar('wer', wer, epoch, name)
            print_once(f"==========>>>>>>{name} Loss: {eloss}\n")
            print_once(f"==========>>>>>>{name} WER: {wer}\n")
def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels,
         device, args):
    """Performs evaluation and prints performance statistics.

    Args:
        data_layer: data layer object that holds the data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        device: device to run evaluation on
        args: script input arguments
    """
    batch_size = args.batch_size
    steps = args.steps
    audio_processor.eval()
    encoderdecoder.eval()
    greedy_decoder.eval()

    if args.torch_script:
        audio, audio_len = audio_from_file(args.sample_audio, device=device)
        audio_processor, encoderdecoder, greedy_decoder = jit_export(
            audio, audio_len, audio_processor, encoderdecoder, greedy_decoder, args)

    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
        }
        it = 0
        ep = 0
        if steps is None:
            steps = math.ceil(len(data_layer) / batch_size)
        durations_dnn = []
        durations_dnn_and_prep = []
        seq_lens = []
        sync = lambda: torch.cuda.synchronize() if device.type == 'cuda' else None
        # Loop over epochs until `steps` batches have been timed.
        while True:
            ep += 1
            for data in tqdm(data_layer.data_iterator):
                it += 1
                if it > steps:
                    break
                tensors = [t.to(device) for t in data]
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

                sync()
                t0 = time.perf_counter()
                features, lens = audio_processor(t_audio_signal_e, t_a_sig_length_e)
                sync()
                t1 = time.perf_counter()
                if isinstance(encoderdecoder, torch.jit.TracedModule):
                    t_log_probs_e = encoderdecoder(features)
                else:
                    t_log_probs_e, _ = encoderdecoder.infer((features, lens))
                sync()
                stop_time = time.perf_counter()

                time_prep_and_dnn = stop_time - t0
                time_dnn = stop_time - t1
                t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
                values_dict = dict(
                    predictions=[t_predictions_e],
                    transcript=[t_transcript_e],
                    transcript_length=[t_transcript_len_e],
                )
                process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
                durations_dnn.append(time_dnn)
                durations_dnn_and_prep.append(time_prep_and_dnn)
                seq_lens.append(features[0].shape[-1])

            if it >= steps:
                wer, _ = process_evaluation_epoch(_global_var_dict)
                print("==========>>>>>>Evaluation of all iterations WER: {0}\n".format(wer))
                break

        ratios = [0.9, 0.95, 0.99, 1.]
        latencies_dnn = take_durations_and_output_percentile(durations_dnn, ratios)
        latencies_dnn_and_prep = take_durations_and_output_percentile(
            durations_dnn_and_prep, ratios)
        print("\n using batch size {} and {} frames ".format(batch_size, seq_lens[-1]))
        print("\n".join(["dnn latency {} : {} ".format(k, v)
                         for k, v in latencies_dnn.items()]))
        print("\n".join(["prep + dnn latency {} : {} ".format(k, v)
                         for k, v in latencies_dnn_and_prep.items()]))