def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels,
         multi_gpu, args):
    """performs inference / evaluation
    Args:
        data_layer: data layer object that holds data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        multi_gpu: true if using multiple gpus
        args: script input arguments
    """
    logits_save_to = args.logits_save_to
    audio_processor.eval()
    encoderdecoder.eval()
    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
            'logits': [],
        }

        for it, data in enumerate(tqdm(data_layer.data_iterator)):
            tensors = []
            for d in data:
                tensors.append(d.cuda())

            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

            inp = (t_audio_signal_e, t_a_sig_length_e)

            t_processed_signal, p_length_e = audio_processor(x=inp)
            t_log_probs_e, _ = encoderdecoder((t_processed_signal, p_length_e))
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

            values_dict = dict(predictions=[t_predictions_e],
                               transcript=[t_transcript_e],
                               transcript_length=[t_transcript_len_e],
                               output=[t_log_probs_e])
            process_evaluation_batch(values_dict,
                                     _global_var_dict,
                                     labels=labels)

            if args.steps is not None and it + 1 >= args.steps:
                break
        wer, _ = process_evaluation_epoch(_global_var_dict)
        if (not multi_gpu
                or (multi_gpu and torch.distributed.get_rank() == 0)):
            print("==========>>>>>>Evaluation WER: {0}\n".format(wer))
            if args.save_prediction is not None:
                with open(args.save_prediction, 'w') as fp:
                    fp.write('\n'.join(_global_var_dict['predictions']))
            if logits_save_to is not None:
                logits = []
                for batch in _global_var_dict["logits"]:
                    for i in range(batch.shape[0]):
                        logits.append(batch[i].cpu().numpy())
                with open(logits_save_to, 'wb') as f:
                    pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL)
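Every example in this listing funnels results through process_evaluation_batch and process_evaluation_epoch, but neither helper is shown. Below is a minimal sketch of what they typically do, assuming word-level edit-distance WER, no distributed reduction, and prediction sequences that are already collapsed label indices; the repository's actual helpers differ.

# Simplified stand-ins for the two evaluation helpers used throughout (assumption,
# not the source repository's implementation).
def process_evaluation_batch(values_dict, global_vars, labels):
    # accumulate decoded hypotheses and references as plain strings;
    # assumes prediction sequences are already collapsed label indices
    # (raw CTC frames would first need blank/repeat removal)
    for preds in values_dict['predictions']:
        seqs = preds.tolist() if hasattr(preds, 'tolist') else preds
        global_vars['predictions'].extend(
            ''.join(labels[i] for i in seq) for seq in seqs)
    for trans, lens in zip(values_dict['transcript'],
                           values_dict['transcript_length']):
        for seq, n in zip(trans.tolist(), lens.tolist()):
            global_vars['transcripts'].append(''.join(labels[i] for i in seq[:n]))


def _levenshtein(a, b):
    # classic dynamic-programming edit distance between two token sequences
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        cur = [i]
        for j, y in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (x != y)))
        prev = cur
    return prev[-1]


def process_evaluation_epoch(global_vars):
    # WER = total word-level edit distance / total reference words
    errors, words = 0, 0
    for hyp, ref in zip(global_vars['predictions'], global_vars['transcripts']):
        h, r = hyp.split(), ref.split()
        errors += _levenshtein(h, r)
        words += len(r)
    return errors / max(words, 1), None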
Example #2
def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels,
         args):
    """performs inference / evaluation
    Args:
        data_layer: data layer object that holds data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        args: script input arguments
    """
    logits_save_to = args.logits_save_to
    encoderdecoder.eval()
    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
            'logits': [],
        }

        for it, data in enumerate(tqdm(data_layer.data_iterator)):
            (t_audio_signal_e, t_a_sig_length_e, transcript_list,
             t_transcript_e, t_transcript_len_e) = audio_processor(data)

            # t_log_probs_e, (_, _) = torch.jit.trace(encoderdecoder,
            #     ((t_audio_signal_e, t_transcript_e),
            #      (t_a_sig_length_e, t_transcript_len_e),),
            # )

            # This is basically useless: in the case of RNN-T, the encoder
            # outputs don't mean anything by themselves.
            # t_log_probs_e, (_, _) = encoderdecoder(
            #     ((t_audio_signal_e, t_transcript_e),
            #      (t_a_sig_length_e, t_transcript_len_e),)
            # )
            t_predictions_e = greedy_decoder.decode(t_audio_signal_e,
                                                    t_a_sig_length_e)

            values_dict = dict(
                predictions=[t_predictions_e],
                transcript=transcript_list,
                transcript_length=t_transcript_len_e,
            )
            process_evaluation_batch(values_dict,
                                     _global_var_dict,
                                     labels=labels)

            if args.steps is not None and it + 1 >= args.steps:
                break
        wer = process_evaluation_epoch(_global_var_dict)
        print("==========>>>>>>Evaluation WER: {0}\n".format(wer))
        if args.save_prediction is not None:
            with open(args.save_prediction, 'w') as fp:
                fp.write('\n'.join(_global_var_dict['predictions']))
        if logits_save_to is not None:
            # note: this RNN-T variant never collects logits into
            # _global_var_dict, so an empty list is pickled here
            logits = []
            with open(logits_save_to, 'wb') as f:
                pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL)
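For reference, a hedged sketch of the argument namespace these eval() variants read; only the attributes accessed above are listed, and the final call is left commented because the model objects come from the surrounding script.

from argparse import Namespace

# Attributes read by the eval() variants above; which subset is needed
# depends on the variant being run.
args = Namespace(
    steps=None,            # limit the number of evaluation batches (None = all)
    save_prediction=None,  # optional path: write decoded hypotheses, one per line
    logits_save_to=None,   # optional path: pickle per-utterance log-probabilities
)
# eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels, args)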
    def eval(model, name=''):
        """Evaluates model on evaluation dataset
        """
        with torch.no_grad():
            _global_var_dict = {
                'EvalLoss': [],
                'predictions': [],
                'transcripts': [],
            }
            eval_dataloader = data_layer_eval.data_iterator
            for data in eval_dataloader:
                tensors = []
                for d in data:
                    if isinstance(d, torch.Tensor):
                        tensors.append(d.cuda())
                    else:
                        tensors.append(d)
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

                model.eval()
                if optim_level == 1:
                    with amp.disable_casts():
                        t_processed_signal_e, t_processed_sig_length_e = audio_preprocessor(
                            t_audio_signal_e, t_a_sig_length_e)
                else:
                    t_processed_signal_e, t_processed_sig_length_e = audio_preprocessor(
                        t_audio_signal_e, t_a_sig_length_e)
                if jasper_encoder.use_conv_mask:
                    t_log_probs_e, t_encoded_len_e = model.forward(
                        (t_processed_signal_e, t_processed_sig_length_e))
                else:
                    t_log_probs_e = model.forward(t_processed_signal_e)
                    # the unmasked path returns no encoded lengths; fall back to
                    # the preprocessed lengths so the CTC loss below still runs
                    t_encoded_len_e = t_processed_sig_length_e
                t_loss_e = ctc_loss(log_probs=t_log_probs_e,
                                    targets=t_transcript_e,
                                    input_length=t_encoded_len_e,
                                    target_length=t_transcript_len_e)
                t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

                values_dict = dict(loss=[t_loss_e],
                                   predictions=[t_predictions_e],
                                   transcript=[t_transcript_e],
                                   transcript_length=[t_transcript_len_e])
                process_evaluation_batch(values_dict,
                                         _global_var_dict,
                                         labels=labels)

            # final aggregation (across all workers and minibatches) and logging of results
            wer, eloss = process_evaluation_epoch(_global_var_dict)

            if name != '':
                name = '_' + name

            print_once(f"==========>>>>>>Evaluation{name} Loss: {eloss}\n")
            print_once(f"==========>>>>>>Evaluation{name} WER: {wer}\n")
def global_process_batch(log_probs, original_tensors, batch_size, is_trt=True):
    '''Accumulates prediction evaluations for batches across an epoch

    is_trt determines which global dictionary will be used.
    To get WER at any point, use global_process_epoch.
    For one-off WER evaluations, use get_results()
    '''
    # State-based approach for full WER comparison across a dataset.
    greedy_decoder = GreedyCTCDecoder()
    predicts = norm(greedy_decoder(log_probs=log_probs))
    values_dict = dict(
        predictions=[predicts],
        transcript=[original_tensors[2][0:batch_size, ...]],
        transcript_length=[original_tensors[3][0:batch_size, ...]],
    )
    dict_to_process = _global_trt_dict if is_trt else _global_pyt_dict
    process_evaluation_batch(values_dict, dict_to_process, labels=get_vocab())
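GreedyCTCDecoder is instantiated above but not shown; a functional sketch of what greedy CTC decoding amounts to, assuming the blank symbol sits at index len(labels) (the usual convention in these scripts, but an assumption here).

import torch

def greedy_ctc_decode(log_probs, labels, blank_id=None):
    # log_probs: (batch, time, vocab) log-probabilities
    blank_id = len(labels) if blank_id is None else blank_id
    preds = log_probs.argmax(dim=-1)          # per-frame argmax
    texts = []
    for seq in preds.tolist():
        out, prev = [], None
        for tok in seq:
            # collapse consecutive repeats and drop blanks
            if tok != prev and tok != blank_id:
                out.append(labels[tok])
            prev = tok
        texts.append(''.join(out))
    return texts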
Example #5
    def eval():
        """Evaluates model on evaluation dataset
        """
        with torch.no_grad():
            _global_var_dict = {
                'EvalLoss': [],
                'predictions': [],
                'transcripts': [],
            }
            eval_dataloader = data_layer_eval.data_iterator
            for data in eval_dataloader:
                tensors = []
                for d in data:
                    if isinstance(d, torch.Tensor):
                        tensors.append(d.cuda())
                    else:
                        tensors.append(d)
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

                model.eval()
                t_log_probs_e, t_encoded_len_e = model(x=(t_audio_signal_e, t_a_sig_length_e))
                t_loss_e = ctc_loss(log_probs=t_log_probs_e, targets=t_transcript_e, input_length=t_encoded_len_e, target_length=t_transcript_len_e)
                t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

                values_dict = dict(
                    loss=[t_loss_e],
                    predictions=[t_predictions_e],
                    transcript=[t_transcript_e],
                    transcript_length=[t_transcript_len_e]
                )
                process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

            # final aggregation (across all workers and minibatches) and logging of results
            #lnw modified for cer
            #wer, eloss = process_evaluation_epoch(_global_var_dict)
            wer, eloss, cer = process_evaluation_epoch2(_global_var_dict)
        
            print_once("==========>>>>>>Evaluation Loss: {0}".format(eloss))
            print_once("==========>>>>>>Evaluation WER: {0}".format(wer))
            #lnw add for cer
            print_once("==========>>>>>>Evaluation CER: {0}".format(cer))
            #lnw add
            print("Evaluation end time : "+str(datetime.now()))
def calc_wer(data_layer, audio_processor, 
             encoderdecoder, greedy_decoder, 
             labels, args):

    encoderdecoder = encoderdecoder.module if hasattr(encoderdecoder, 'module') else encoderdecoder
    with torch.no_grad():
        # reset global_var_dict - results of evaluation will be stored there
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
            'logits' : [],
        }

        # Evaluation mini-batch for loop
        for it, data in enumerate(tqdm(data_layer.data_iterator)):

            tensors = []
            dl_device = torch.device("cpu") if args.cpu_run else torch.device("cuda")
            for d in data:
                tensors.append(d.to(dl_device))
    
            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors
    
            t_processed_signal = audio_processor(t_audio_signal_e, t_a_sig_length_e) 
            t_log_probs_e, _ = encoderdecoder.infer(t_processed_signal)
            t_predictions_e = greedy_decoder(t_log_probs_e)
    
            values_dict = dict(
                predictions=[t_predictions_e],
                transcript=[t_transcript_e],
                transcript_length=[t_transcript_len_e],
                output=[t_log_probs_e]
            )
            # values_dict will contain results from all workers
            process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
    
            if args.steps is not None and it + 1 >= args.steps:
                break

        # final aggregation (over minibatches) and logging of results
        wer, _ = process_evaluation_epoch(_global_var_dict)

        return wer, _global_var_dict
Example #7
def ver_Process(data, audio_processor, _global_var_dict, labels, rnnt_hw_model, greedy_decoder):
        s = time.time()
        (t_audio_signal_e, t_a_sig_length_e,
            transcript_list, t_transcript_e,
            t_transcript_len_e) = audio_processor(data)
        h_rnns = (None, None)
        label = []
        hidden = None
        mode = args.mode
        thread_Lock.acquire()
        if mode == 1:
            # DPU inference on dev-clean
            s_t = time.time()
            (t_predictions_e, lstm_run_time,
             convert_time, read_time,
             max_length) = rnnt_hw_model.run_librispeech(t_audio_signal_e, t_a_sig_length_e)
            rnnt_hw_model.rnnt_reflash_ddr()
            e_t = time.time()
            hard_time.append(e_t - s_t)
            lstm_run_time_t.append(lstm_run_time)
        
        elif mode == 2:
            # test long wav
            wav_path = './my_work_dir/demo/demo4.wav'

            rnnt_hw_model.print_wav_info(t_transcript_e.data)
            cut_time = 6
            t_predictions_e = rnnt_hw_model.run_long_demo(wav_path, fbank, cut_time)

        else:
            print('mode not supported, please check it')
        
        values_dict = dict(
                predictions=[t_predictions_e],
                transcript=transcript_list,
                transcript_length=t_transcript_len_e,
        )
        process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
        thread_Lock.release()
def get_results(log_probs, original_tensors, batch_size):
    ''' Returns WER and predictions for the outputs of the acoustic model

    Used for one-off batches. Epoch-wide evaluation should use
    global_process_batch and global_process_epoch
    '''
    # Used to get WER and predictions for one-off batches
    greedy_decoder = GreedyCTCDecoder()
    predicts = norm(greedy_decoder(log_probs=log_probs))
    values_dict = dict(
        predictions=[predicts],
        transcript=[original_tensors[2][0:batch_size, ...]],
        transcript_length=[original_tensors[3][0:batch_size, ...]],
    )
    temp_dict = {
        'predictions': [],
        'transcripts': [],
    }
    process_evaluation_batch(values_dict, temp_dict, labels=get_vocab())
    predictions = temp_dict['predictions']
    wer, _ = process_evaluation_epoch(temp_dict)
    return wer, predictions
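A hypothetical one-off use of get_results during training or debugging; the tensor layout (index 2/3 holding transcripts and their lengths) is exactly what the slicing above assumes.

# log_probs, _ = encoderdecoder((features, feature_lens))
# wer, hyps = get_results(log_probs, tensors, batch_size=tensors[0].size(0))
# print("batch WER: {:.3f} | first hypothesis: {}".format(wer, hyps[0]))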
Example #9
def eval(
        data_layer,
        audio_processor,
        greedy_decoder,
        labels,
        args):
        """performs inference / evaluation
        Args:
            data_layer: data layer object that holds data loader
            audio_processor: data processing module
            greedy_decoder: greedy decoder
            labels: list of labels as output vocabulary
            args: script input arguments
        """
        start_t = time.time()
        if args.mode == 1 or args.mode == 2:
            rnnt_hw_model = RNNT_infer_model()
        else:
            rnnt_hw_model = None
        
        logits_save_to = args.logits_save_to
        with torch.no_grad():
                _global_var_dict = {
                        'predictions': [],
                        'transcripts': [],
                        'logits': [],
                }

        Processnum = []
        for it, data in enumerate(data_layer.data_iterator):
            if args.mode == 3:
                (t_audio_signal_e, t_a_sig_length_e,
                    transcript_list, t_transcript_e,
                    t_transcript_len_e) = audio_processor(data)
                h_rnns = (None, None)
                label = []
                hidden = None
                # greedy decode on CPU
                t_transcript_e = torch.nn.utils.rnn.pad_packed_sequence(t_transcript_e, batch_first=True)[0]
                t_predictions_e, h_pre_rnns, hidden_predict, decode_batch_length = greedy_decoder.decode(
                    t_audio_signal_e, t_a_sig_length_e, h_rnns, label, hidden, None)
            else:
                Process_ver = MyProcess(data, audio_processor, _global_var_dict, labels, ver_Process, rnnt_hw_model, greedy_decoder)
                Process_ver.start()
                Processnum.append(Process_ver)
                
                if args.steps is not None and it + 1 >= args.steps:
                    break
        if args.mode != 3:
            # join every worker started above, not just the last one
            for p in Processnum:
                p.join()
        else:
            values_dict = dict(
                    predictions=[t_predictions_e],
                    transcript=transcript_list,
                    transcript_length=t_transcript_len_e,
            )
            process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

        wer = process_evaluation_epoch(_global_var_dict)
        print("=================>Evaluation WER: {0}\n".format(wer))
        if args.save_prediction is not None:
            with open(args.save_prediction, 'w') as fp:
                fp.write('\n'.join(_global_var_dict['predictions']))
        
        end_t = time.time()
        if args.mode == 1:
            print('dpu computation time (lstm_run time)', sum(lstm_run_time_t))
        print('e2e decode time:', end_t - start_t)
Example #10
def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels,
         args):
    """performs evaluation and prints performance statistics
    Args:
        data_layer: data layer object that holds data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        args: script input arguments
    """
    batch_size = args.batch_size
    steps = args.steps
    audio_processor.eval()
    encoderdecoder.eval()
    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
        }

        it = 0
        ep = 0

        if steps is None:
            steps = math.ceil(len(data_layer) / batch_size)
        durations_dnn = []
        durations_dnn_and_prep = []
        seq_lens = []
        while True:
            ep += 1
            for data in tqdm(data_layer.data_iterator):
                it += 1
                if it > steps:
                    break
                tensors = []
                dl_device = torch.device("cuda")
                for d in data:
                    tensors.append(d.to(dl_device))

                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

                inp = (t_audio_signal_e, t_a_sig_length_e)
                torch.cuda.synchronize()
                t0 = time.perf_counter()
                t_processed_signal, p_length_e = audio_processor(x=inp)
                torch.cuda.synchronize()
                t1 = time.perf_counter()
                t_log_probs_e, _ = encoderdecoder(
                    (t_processed_signal, p_length_e))
                torch.cuda.synchronize()
                stop_time = time.perf_counter()

                time_prep_and_dnn = stop_time - t0
                time_dnn = stop_time - t1
                t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

                values_dict = dict(
                    predictions=[t_predictions_e],
                    transcript=[t_transcript_e],
                    transcript_length=[t_transcript_len_e],
                )
                process_evaluation_batch(values_dict,
                                         _global_var_dict,
                                         labels=labels)
                durations_dnn.append(time_dnn)
                durations_dnn_and_prep.append(time_prep_and_dnn)
                seq_lens.append(t_processed_signal.shape[-1])

            if it >= steps:

                wer, _ = process_evaluation_epoch(_global_var_dict)
                print(
                    "==========>>>>>>Evaluation of all iterations WER: {0}\n".
                    format(wer))
                break

        ratios = [0.9, 0.95, 0.99, 1.]
        latencies_dnn = take_durations_and_output_percentile(
            durations_dnn, ratios)
        latencies_dnn_and_prep = take_durations_and_output_percentile(
            durations_dnn_and_prep, ratios)
        print("\n using batch size {} and {} frames ".format(
            batch_size, seq_lens[-1]))
        print("\n".join([
            "dnn latency {} : {} ".format(k, v)
            for k, v in latencies_dnn.items()
        ]))
        print("\n".join([
            "prep + dnn latency {} : {} ".format(k, v)
            for k, v in latencies_dnn_and_prep.items()
        ]))
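take_durations_and_output_percentile is used above but not shown; a sketch of the behaviour it appears to have, mapping each ratio to that percentile of the sorted per-batch durations (the real helper may also discard warm-up iterations).

def take_durations_and_output_percentile(durations, ratios):
    # ratio 1.0 corresponds to the worst observed latency
    durations = sorted(durations)
    latencies = {}
    for ratio in ratios:
        idx = max(int(len(durations) * ratio) - 1, 0)
        latencies[ratio] = durations[min(idx, len(durations) - 1)]
    return latencies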
Example #11
def eval(
        data_layer,
        audio_processor,
        encoderdecoder,
        greedy_decoder,
        labels,
        multi_gpu,
        args):
    """performs inference / evaluation
    Args:
        data_layer: data layer object that holds data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        multi_gpu: true if using multiple gpus
        args: script input arguments
    """
    if args.ipex:
        import intel_extension_for_pytorch as ipex

    logits_save_to=args.logits_save_to
    encoderdecoder.eval()
    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
            'logits' : [],
        }

        if args.wav:
            # TODO unimplemented in ipex
            assert False, "wav unsupported in ipex for now"
            features, p_length_e = audio_processor(audio_from_file(args.wav))
            # torch.cuda.synchronize()
            t0 = time.perf_counter()
            t_log_probs_e = encoderdecoder(features)
            # torch.cuda.synchronize()
            t1 = time.perf_counter()
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
            hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=labels)
            print("INFERENCE TIME\t\t: {} ms".format((t1-t0)*1000.0))
            print("TRANSCRIPT\t\t:", hypotheses[0])
            return
        steps_per_epoch = len(data_layer)
        total_steps = args.steps if args.steps is not None else steps_per_epoch
        test_epoches = int(total_steps / steps_per_epoch)
        print('Evaluating RNNT: Steps per Epoch {} total Steps {}'.format(steps_per_epoch, total_steps))

        # Int8 Calibration
        if args.ipex and args.int8 and args.calibration:
            print("runing int8 calibration step\n")
            conf = ipex.AmpConf(torch.int8)            
            for it, data in enumerate(tqdm(data_layer.data_iterator)):
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data)
                
                t_predictions_e, conf = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)

                if args.steps is not None and it + 1 >= args.steps:
                    break
            conf.save(args.configure_dir)
        # Inference (vanilla cpu, dnnl fp32 or dnnl int8)
        else:
            if not args.ipex:
                if args.warm_up > 0:
                    print("\nstart warm up, warmp_up steps = ", args.warm_up)
                    for it, data in enumerate(tqdm(data_layer.data_iterator)):
                            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data)
                            conf = None
                            t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                            
                            if it + 1 >= args.warm_up:
                                break
                print("\nstart measure performance, measure steps = ", total_steps)
                total_time = 0
                with tqdm(total=total_steps) as pbar:
                    for epoch in range(test_epoches + 1):
                        for it, data in enumerate(data_layer.data_iterator):
                            if  epoch * steps_per_epoch + it >= total_steps:
                                break
                            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data)
                            if args.profiling:
                                # with torch.autograd.profiler.profile(args.profiling) as prof:
                                with torch.profiler.profile(on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')) as prof:
                                    conf = None
                                    t0 = time.perf_counter()
                                    t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                                    t1 = time.perf_counter()
                            else:
                                conf = None
                                t0 = time.perf_counter()
                                t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                                t1 = time.perf_counter()

                            total_time += (t1 - t0)

                            values_dict = dict(
                                predictions=[t_predictions_e],
                                transcript=[t_transcript_e],
                                transcript_length=[t_transcript_len_e],
                            )
                            process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

                            pbar.update(1)
            else:
                if args.mix_precision:
                    with torch.cpu.amp.autocast():
                        # warm up
                        if args.warm_up > 0:
                            print("\nstart warm up, warmp_up steps = ", args.warm_up)
                            for it, data in enumerate(tqdm(data_layer.data_iterator)):
                                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data)
                                conf = None
                                t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                                
                                if it + 1 >= args.warm_up:
                                    break

                        # measure performance
                        print("\nstart measure performance, measure steps = ", total_steps)
                        total_time = 0
                        # with torch.autograd.profiler.profile(args.profiling) as prof:
                        with tqdm(total=total_steps) as pbar:
                            for epoch in range(test_epoches + 1):
                                for it, data in enumerate(data_layer.data_iterator):
                                    if epoch * steps_per_epoch + it >= total_steps:
                                        break
                                    t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data)
                                    if args.profiling:
                                        # with torch.autograd.profiler.profile(args.profiling) as prof:
                                        with torch.profiler.profile(on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')) as prof:
                                            conf = None
                                            t0 = time.perf_counter()
                                            t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                                            t1 = time.perf_counter()
                                    else:
                                        conf = None
                                        t0 = time.perf_counter()
                                        t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                                        t1 = time.perf_counter()

                                    total_time += (t1 - t0)

                                    values_dict = dict(
                                        predictions=[t_predictions_e],
                                        transcript=[t_transcript_e],
                                        transcript_length=[t_transcript_len_e],
                                    )
                                    process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

                                    pbar.update(1)
                else:
                    # warm up
                    if args.warm_up > 0:
                        print("\nstart warm up, warmp_up steps = ", args.warm_up)
                        for it, data in enumerate(tqdm(data_layer.data_iterator)):
                            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data)
                            conf = None
                            t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                            
                            if it + 1 >= args.warm_up:
                                break

                    # measure performance
                    print("\nstart measure performance, measure steps = ", total_steps)
                    total_time = 0
                    # with torch.autograd.profiler.profile(args.profiling) as prof:
                    with tqdm(total=total_steps) as pbar:
                        for epoch in range(test_epoches + 1):
                            for it, data in enumerate(data_layer.data_iterator):
                                if epoch * steps_per_epoch + it >= total_steps:
                                    break
                                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data)
                                if args.profiling:
                                    # with torch.autograd.profiler.profile(args.profiling) as prof:
                                    with torch.profiler.profile(on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')) as prof:
                                        conf = None
                                        t0 = time.perf_counter()
                                        t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                                        t1 = time.perf_counter()
                                else:
                                    conf = None
                                    t0 = time.perf_counter()
                                    t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                                    t1 = time.perf_counter()

                                total_time += (t1 - t0)

                                values_dict = dict(
                                    predictions=[t_predictions_e],
                                    transcript=[t_transcript_e],
                                    transcript_length=[t_transcript_len_e],
                                )
                                process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

                                pbar.update(1)

            if args.print_result:
                hypotheses = _global_var_dict['predictions']
                references = _global_var_dict['transcripts']

                nb = len(hypotheses)
                print("print %d sample results: " % nb)
                for hyp, ref in zip(hypotheses, references):
                    print("hyp: ", hyp)
                    print("ref: ", ref)
                    print()
            
            if args.profiling:
                # print(prof.key_averages().table(sort_by="cpu_time_total"))
                print(prof.key_averages().table(sort_by="self_cpu_time_total"))

            wer, _ = process_evaluation_epoch(_global_var_dict)
            if (not multi_gpu or (multi_gpu and torch.distributed.get_rank() == 0)):
                print("\n=========================>>>>>>")
                print("Evaluation WER: {0}".format(wer))
                print("Accuracy: {:.15f} ".format(1 - wer))
                if args.save_prediction is not None:
                    with open(args.save_prediction, 'w') as fp:
                        fp.write('\n'.join(_global_var_dict['predictions']))
                if logits_save_to is not None:
                    logits = []
                    for batch in _global_var_dict["logits"]:
                        for i in range(batch.shape[0]):
                            logits.append(batch[i].cpu().numpy())
                    with open(logits_save_to, 'wb') as f:
                        pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL)

            if args.steps:
                total_samples = args.steps * args.batch_size
            else:
                total_samples = len(data_layer)

            print("total samples tested: ", total_samples)
            print("total time (encoder + decoder, excluded audio processing): ", total_time, "s")
            print("dataset size: ", len(data_layer))

            perf = total_samples / total_time

            print("Throughput: {:.3f} fps".format(perf))
Example #12
def eval(
        data_layer,
        audio_processor,
        encoderdecoder,
        greedy_decoder,
        labels,
        multi_gpu,
        args):
    """performs inference / evaluation
    Args:
        data_layer: data layer object that holds data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        multi_gpu: true if using multiple gpus
        args: script input arguments
    """
    logits_save_to=args.logits_save_to
    encoderdecoder.eval()
    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
            'logits' : [],
        }


        
        if args.wav:
            features, p_length_e = audio_processor(audio_from_file(args.wav))
            torch.cuda.synchronize()
            t0 = time.perf_counter()
            t_log_probs_e = encoderdecoder(features)
            torch.cuda.synchronize()
            t1 = time.perf_counter()
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
            hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=labels)
            print("INFERENCE TIME\t\t: {} ms".format((t1-t0)*1000.0))
            print("TRANSCRIPT\t\t:", hypotheses[0])
            return
        
        for it, data in enumerate(tqdm(data_layer.data_iterator)):
            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data)

            t_log_probs_e, (x_len, y_len) = encoderdecoder(
                    ((t_audio_signal_e, t_transcript_e), (t_a_sig_length_e, t_transcript_len_e)),
            )
            t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e)

            values_dict = dict(
                predictions=[t_predictions_e],
                transcript=[t_transcript_e],
                transcript_length=[t_transcript_len_e],
                output=[t_log_probs_e]
            )
            process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

            if args.steps is not None and it + 1 >= args.steps:
                break
        wer, _ = process_evaluation_epoch(_global_var_dict)
        if (not multi_gpu or (multi_gpu and torch.distributed.get_rank() == 0)):
            print("==========>>>>>>Evaluation WER: {0}\n".format(wer))
            if args.save_prediction is not None:
                with open(args.save_prediction, 'w') as fp:
                    fp.write('\n'.join(_global_var_dict['predictions']))
            if logits_save_to is not None:
                logits = []
                for batch in _global_var_dict["logits"]:
                    for i in range(batch.shape[0]):
                        logits.append(batch[i].cpu().numpy())
                with open(logits_save_to, 'wb') as f:
                    pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL)
Example #13
    def evalutaion(epoch=0):
        model.eval()
        if args.ipex:
            if args.bf16:
                print("running bfloat16 evaluation step\n")
            else:
                print("running fp32 evaluation step\n")

        for dataset, frequency, name in eval_datasets:
            if epoch % frequency != 0:
                continue

            print_once(f"Doing {name} ....................... ......  ... .. . .")

            with torch.no_grad():
                _global_var_dict = {
                    'EvalLoss': [],
                    'predictions': [],
                    'transcripts': [],
                }
                dataloader = dataset.data_iterator
                for data in dataloader:
                    t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = data_transforms(data)

                    if args.ipex:
                        if args.bf16:
                            with torch.cpu.amp.autocast():
                                t_log_probs_e, (x_len, y_len) = model(
                                    ((t_audio_signal_e, t_transcript_e), (t_a_sig_length_e, t_transcript_len_e)),
                                )
                        elif args.fp32:
                            t_log_probs_e, (x_len, y_len) = model(
                                ((t_audio_signal_e, t_transcript_e), (t_a_sig_length_e, t_transcript_len_e)),
                            )
                    else:
                        t_log_probs_e, (x_len, y_len) = model(
                            ((t_audio_signal_e, t_transcript_e), (t_a_sig_length_e, t_transcript_len_e)),
                        )
                    t_loss_e = loss_fn(
                        (t_log_probs_e, x_len), (t_transcript_e, y_len)
                    )
                    print(t_loss_e)
                    del t_log_probs_e

                    t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e)

                    values_dict = dict(
                        loss=[t_loss_e],
                        predictions=[t_predictions_e],
                        transcript=[t_transcript_e],
                        transcript_length=[t_transcript_len_e]
                    )
                    process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

                # final aggregation (across all workers and minibatches) and logging of results
                wer, eloss = process_evaluation_epoch(_global_var_dict)
                logger.log_scalar('loss', eloss, epoch, name)
                logger.log_scalar('wer', wer, epoch, name)

                print_once(f"==========>>>>>>{name} Loss: {eloss}\n")
                print_once(f"==========>>>>>>{name} WER: {wer}\n")
def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels,
         device, args):
    """performs evaluation and prints performance statistics
    Args:
        data_layer: data layer object that holds data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        device: torch.device to run evaluation on
        args: script input arguments
    """
    batch_size = args.batch_size
    steps = args.steps
    audio_processor.eval()
    encoderdecoder.eval()
    greedy_decoder.eval()

    if args.torch_script:
        audio, audio_len = audio_from_file(args.sample_audio, device=device)
        audio_processor, encoderdecoder, greedy_decoder = jit_export(
            audio, audio_len, audio_processor, encoderdecoder, greedy_decoder,
            args)

    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
        }

        it = 0
        ep = 0

        if steps is None:
            steps = math.ceil(len(data_layer) / batch_size)
        durations_dnn = []
        durations_dnn_and_prep = []
        seq_lens = []

        def sync():
            # only synchronize when timing CUDA kernels
            if device.type == 'cuda':
                torch.cuda.synchronize()

        while True:
            ep += 1
            for data in tqdm(data_layer.data_iterator):
                it += 1
                if it > steps:
                    break
                tensors = [t.to(device) for t in data]

                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

                sync()
                t0 = time.perf_counter()
                features, lens = audio_processor(t_audio_signal_e,
                                                 t_a_sig_length_e)

                sync()
                t1 = time.perf_counter()
                if isinstance(encoderdecoder, torch.jit.TracedModule):
                    t_log_probs_e = encoderdecoder(features)
                else:
                    t_log_probs_e, _ = encoderdecoder.infer((features, lens))

                sync()
                stop_time = time.perf_counter()
                time_prep_and_dnn = stop_time - t0
                time_dnn = stop_time - t1
                t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

                values_dict = dict(
                    predictions=[t_predictions_e],
                    transcript=[t_transcript_e],
                    transcript_length=[t_transcript_len_e],
                )
                process_evaluation_batch(values_dict,
                                         _global_var_dict,
                                         labels=labels)
                durations_dnn.append(time_dnn)
                durations_dnn_and_prep.append(time_prep_and_dnn)
                seq_lens.append(features[0].shape[-1])

            if it >= steps:

                wer, _ = process_evaluation_epoch(_global_var_dict)
                print(
                    "==========>>>>>>Evaluation of all iterations WER: {0}\n".
                    format(wer))
                break

        ratios = [0.9, 0.95, 0.99, 1.]
        latencies_dnn = take_durations_and_output_percentile(
            durations_dnn, ratios)
        latencies_dnn_and_prep = take_durations_and_output_percentile(
            durations_dnn_and_prep, ratios)
        print("\n using batch size {} and {} frames ".format(
            batch_size, seq_lens[-1]))
        print("\n".join([
            "dnn latency {} : {} ".format(k, v)
            for k, v in latencies_dnn.items()
        ]))
        print("\n".join([
            "prep + dnn latency {} : {} ".format(k, v)
            for k, v in latencies_dnn_and_prep.items()
        ]))
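jit_export is called near the top of this function but not shown in the listing; a minimal sketch of what such an export step could look like, assuming plain torch.jit.trace on one real utterance. Tracing freezes control flow to the example inputs, which is why the loop above special-cases torch.jit.TracedModule and calls the traced acoustic model with features only.

import torch

def jit_export(audio, audio_len, audio_processor, encoderdecoder, greedy_decoder, args):
    # assumption about jit_export, not the repository's implementation:
    # trace each stage on one real utterance so the evaluation loop can run
    # the TorchScript graphs instead of the eager modules
    with torch.no_grad():
        features, lens = audio_processor(audio, audio_len)
        traced_prep = torch.jit.trace(audio_processor, (audio, audio_len))
        traced_model = torch.jit.trace(encoderdecoder, (features,))
        log_probs = traced_model(features)
        traced_decoder = torch.jit.trace(greedy_decoder, (log_probs,))
    return traced_prep, traced_model, traced_decoder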