Example #1
0
def run_performance(model_setting, test_setting, perf_results, test_all):
    """Generate fixed-mask BERT test samples and run the performance tests.

    Results are collected into *perf_results* by ``run_perf_tests``. When
    ``test_setting.contiguous`` is set, inputs are first converted to
    contiguous arrays and the conversion cost is optionally folded into the
    reported latency.
    """
    # Resolve the three BERT graph inputs from the model file.
    input_ids, segment_ids, input_mask = get_bert_inputs(
        model_setting.model_path, model_setting.input_ids_name,
        model_setting.segment_ids_name, model_setting.input_mask_name)

    # A fixed mask length keeps latency measurements comparable across samples.
    print(
        f"Generating {test_setting.test_cases} samples for batch_size={test_setting.batch_size} sequence_length={test_setting.sequence_length}"
    )
    all_inputs = generate_test_data(
        test_setting.batch_size, test_setting.sequence_length,
        test_setting.test_cases, test_setting.seed, test_setting.verbose,
        input_ids, segment_ids, input_mask, random_mask_length=False)

    if test_setting.contiguous:
        # Converting to contiguous arrays has a one-time up-front cost.
        all_inputs, contiguous_latency = get_contiguous_inputs(all_inputs)
        print(
            "Extra latency for converting inputs to contiguous: {} ms".format(
                format(contiguous_latency, '.2f')))
        # Count the conversion cost only when the caller asked for it.
        test_setting.extra_latency = (contiguous_latency
                                      if test_setting.inclusive else 0)

    run_perf_tests(model_setting, test_setting, perf_results, test_all,
                   all_inputs)
Example #2
0
def create_bert_inputs(onnx_model,
                       batch_size,
                       sequence_length,
                       samples,
                       input_ids_name=None,
                       segment_ids_name=None,
                       input_mask_name=None):
    """Build dummy test inputs for a BERT model.

    Args:
        onnx_model (OnnxModel): the ONNX model to generate inputs for
        batch_size (int): batch dimension of each sample
        sequence_length (int): sequence dimension of each sample
        samples (int): how many samples to generate
        input_ids_name (str, optional): graph input name for input IDs.
            Defaults to None (deduced from the model).
        segment_ids_name (str, optional): graph input name for segment IDs.
            Defaults to None (deduced from the model).
        input_mask_name (str, optional): graph input name for the attention
            mask. Defaults to None (deduced from the model).

    Returns:
        List[Dict]: one dict of input tensors per sample
    """
    # Imported lazily so this module does not hard-depend on bert_test_data.
    from bert_test_data import find_bert_inputs, generate_test_data

    ids, segments, mask = find_bert_inputs(onnx_model, input_ids_name,
                                           segment_ids_name, input_mask_name)
    # Fixed seed and fixed mask length keep the generated data deterministic.
    return generate_test_data(batch_size,
                              sequence_length,
                              test_cases=samples,
                              seed=123,
                              verbose=False,
                              input_ids=ids,
                              segment_ids=segments,
                              input_mask=mask,
                              random_mask_length=False)
Example #3
0
def run_performance(model_setting, test_setting, perf_results):
    """Generate fixed-mask test samples and hand them to run_perf_tests.

    Results are collected into *perf_results* by ``run_perf_tests``.
    """
    # Resolve the three BERT graph inputs from the model file.
    input_ids, segment_ids, input_mask = get_bert_inputs(
        model_setting.model_path, model_setting.input_ids_name,
        model_setting.segment_ids_name, model_setting.input_mask_name)

    # A fixed mask length keeps latency measurements comparable across samples.
    print(
        f"Generating {test_setting.test_cases} samples for batch_size={test_setting.batch_size} sequence_length={test_setting.sequence_length}"
    )
    all_inputs = generate_test_data(test_setting.batch_size,
                                    test_setting.sequence_length,
                                    test_setting.test_cases,
                                    test_setting.seed,
                                    test_setting.verbose,
                                    input_ids,
                                    segment_ids,
                                    input_mask,
                                    random_mask_length=False)

    run_perf_tests(model_setting, test_setting, perf_results, all_inputs)
Example #4
0
def run_performance(perf_results, model_path, batch_size, sequence_length,
                    use_gpu, test_cases, test_times, seed, verbose, inclusive,
                    test_all, no_warmup, opt_level):
    """Run BERT performance tests, optionally repeating with contiguous inputs.

    Generates fixed-mask test data, runs the perf tests once on the raw
    inputs, and — only when *test_all* is set — converts the inputs to
    contiguous arrays and runs the tests again. Results are collected into
    *perf_results* by ``run_perf_tests``.

    Args:
        perf_results: container run_perf_tests appends results to
        model_path: path to the ONNX model; input names are deduced from it
        batch_size, sequence_length: sample dimensions
        use_gpu, test_cases, test_times, seed, verbose: test configuration
        inclusive: include the contiguous-conversion latency in reported times
        test_all: also run the contiguous-input variant
        no_warmup, opt_level: passed through to run_perf_tests
    """
    # Try deduce input names from model.
    input_ids, segment_ids, input_mask = get_bert_inputs(model_path)

    # Do not generate random mask for performance test.
    print(
        f"Generating {test_cases} samples for batch_size={batch_size} sequence_length={sequence_length}"
    )
    all_inputs = generate_test_data(batch_size,
                                    sequence_length,
                                    test_cases,
                                    seed,
                                    verbose,
                                    input_ids,
                                    segment_ids,
                                    input_mask,
                                    random_mask_length=False)

    def _run(contiguous, inputs, extra_latency):
        # Single place for the long run_perf_tests argument list; the original
        # duplicated this 13-argument call verbatim, differing only in
        # `contiguous`, `inputs` and `extra_latency`.
        run_perf_tests(perf_results,
                       model_path,
                       batch_size,
                       sequence_length,
                       use_gpu,
                       test_cases,
                       test_times,
                       contiguous,
                       inputs,
                       test_all,
                       no_warmup,
                       opt_level,
                       extra_latency=extra_latency)

    _run(False, all_inputs, 0)

    # only test contiguous array when the --all flag is set.
    if not test_all:
        return

    # Convert inputs to contiguous array, which could improve inference performance
    all_inputs, contiguous_latency = get_contiguous_inputs(all_inputs)
    print("Extra latency for converting inputs to contiguous: {} ms".format(
        format(contiguous_latency, '.2f')))

    _run(True, all_inputs, contiguous_latency if inclusive else 0)
Example #5
0
def run_test(baseline_model, optimized_model, output_dir, batch_size,
             sequence_length, use_gpu, test_cases, seed, use_openmp, verbose,
             rtol, atol, input_ids_name, segment_ids_name, input_mask_name):
    """Check that an optimized BERT model matches its baseline.

    Generates random-mask test data, runs both models on it, optionally dumps
    the test data to *output_dir*, and validates the two result sets with
    ``compare`` using the given *rtol*/*atol* tolerances.
    """
    # Input names not supplied by the caller are deduced from the optimized model.
    input_ids, segment_ids, input_mask = get_bert_inputs(
        optimized_model, input_ids_name, segment_ids_name, input_mask_name)

    # Use random mask length for accuracy test. It might introduce slight inflation in latency reported in this script.
    all_inputs = generate_test_data(
        batch_size, sequence_length, test_cases, seed, verbose,
        input_ids, segment_ids, input_mask, random_mask_length=True)

    # OpenMP environment variables must be set before the very first "import onnxruntime"
    omp_threads = psutil.cpu_count(logical=False) if use_openmp else 1
    setup_openmp_environ(omp_num_threads=omp_threads, omp_wait_policy='ACTIVE')

    # Baseline run with all graph optimizations disabled.
    baseline_results, baseline_latency, output_names = run_model(
        baseline_model, all_inputs, use_gpu, use_openmp,
        disable_optimization=True)
    if verbose:
        print("baseline average latency (all optimizations disabled): {} ms".
              format(statistics.mean(baseline_latency) * 1000))

    if output_dir is not None:
        for case_index, case_inputs in enumerate(all_inputs):
            output_test_data(output_dir, case_index, case_inputs)

    # Treatment run with optimizations enabled.
    treatment_results, treatment_latency, treatment_output_names = run_model(
        optimized_model, all_inputs, use_gpu, use_openmp,
        disable_optimization=False)
    if verbose:
        print("treatment average latency: {} ms".format(
            statistics.mean(treatment_latency) * 1000))

    # Validate the output of baseline and treatment, to make sure the results are similar.
    compare(baseline_results, treatment_results, verbose, rtol, atol)
Example #6
0
def run_test(
    baseline_model,
    optimized_model,
    output_dir,
    batch_size,
    sequence_length,
    use_gpu,
    test_cases,
    seed,
    verbose,
    rtol,
    atol,
    input_ids_name,
    segment_ids_name,
    input_mask_name,
):
    """Check that an optimized BERT model matches its baseline.

    Generates random-mask test data, runs both models on it, optionally dumps
    the test data to *output_dir*, and validates the two result sets with
    ``compare`` using the given *rtol*/*atol* tolerances.
    """
    # Input names not supplied by the caller are deduced from the optimized model.
    input_ids, segment_ids, input_mask = get_bert_inputs(
        optimized_model, input_ids_name, segment_ids_name, input_mask_name)

    # Use random mask length for accuracy test. It might introduce slight inflation in latency reported in this script.
    all_inputs = generate_test_data(batch_size,
                                    sequence_length,
                                    test_cases,
                                    seed,
                                    verbose,
                                    input_ids,
                                    segment_ids,
                                    input_mask,
                                    random_mask_length=True)

    # Baseline run with all graph optimizations disabled.
    baseline_results, baseline_latency, output_names = run_model(
        baseline_model, all_inputs, use_gpu, disable_optimization=True)
    if verbose:
        print("baseline average latency (all optimizations disabled): {} ms".
              format(statistics.mean(baseline_latency) * 1000))

    if output_dir is not None:
        for case_index, case_inputs in enumerate(all_inputs):
            output_test_data(output_dir, case_index, case_inputs)

    # Treatment run with optimizations enabled.
    treatment_results, treatment_latency, treatment_output_names = run_model(
        optimized_model, all_inputs, use_gpu, disable_optimization=False)
    if verbose:
        print("treatment average latency: {} ms".format(
            statistics.mean(treatment_latency) * 1000))

    # Validate the output of baseline and treatment, to make sure the results are similar.
    compare(baseline_results, treatment_results, verbose, rtol, atol)
Example #7
0
def create_bert_inputs(model, batch_size, sequence_length, samples,
                       input_ids_name, segment_ids_name, input_mask_name):
    """Build dummy BERT test inputs for *model*.

    Returns a list with *samples* entries, each a dict of input tensors of
    shape (batch_size, sequence_length). Data is deterministic (fixed seed,
    fixed mask length).
    """
    # Imported lazily so this module does not hard-depend on bert_test_data.
    from bert_test_data import get_bert_inputs, generate_test_data

    ids, segments, mask = get_bert_inputs(model, input_ids_name,
                                          segment_ids_name, input_mask_name)
    return generate_test_data(batch_size,
                              sequence_length,
                              test_cases=samples,
                              seed=123,
                              verbose=False,
                              input_ids=ids,
                              segment_ids=segments,
                              input_mask=mask,
                              random_mask_length=False)
Example #8
0
def run_performance(average_latency, model_path, batch_size, sequence_length,
                    use_gpu, test_cases, test_times, seed, verbose,
                    run_all_settings):
    """Run BERT perf tests and return the contiguous-conversion latency.

    The non-contiguous pass runs only when *run_all_settings* is set; the
    contiguous pass always runs. Results are collected into *average_latency*
    by ``run_perf_tests``.

    Returns:
        float: extra latency (ms) spent converting inputs to contiguous arrays
    """
    # Try deduce input names from model.
    input_ids, segment_ids, input_mask = get_bert_inputs(model_path)

    # A fixed mask length keeps latency measurements comparable across samples.
    print("generating test data...")
    all_inputs = generate_test_data(batch_size, sequence_length, test_cases,
                                    seed, verbose, input_ids, segment_ids,
                                    input_mask, random_mask_length=False)

    # Pass 1: raw (possibly non-contiguous) inputs, gated on run_all_settings.
    if run_all_settings:
        run_perf_tests(average_latency, model_path, batch_size,
                       sequence_length, use_gpu, test_cases, test_times, seed,
                       verbose, False, input_ids, segment_ids, input_mask,
                       all_inputs, run_all_settings)

    # Convert inputs to contiguous array, which could improve inference performance
    all_inputs, contiguous_latency = get_contiguous_inputs(all_inputs)
    print("Extra latency for converting inputs to contiguous: {} ms".format(
        format(contiguous_latency, '.2f')))

    # Pass 2: contiguous inputs, always executed.
    run_perf_tests(average_latency, model_path, batch_size, sequence_length,
                   use_gpu, test_cases, test_times, seed, verbose, True,
                   input_ids, segment_ids, input_mask, all_inputs,
                   run_all_settings)

    return contiguous_latency