def run_performance(model_setting, test_setting, perf_results, test_all):
    """Generate fixed-mask test samples and run the performance tests.

    Args:
        model_setting: model configuration (model path and graph input names).
        test_setting: test configuration (batch size, sequence length, number
            of test cases, seed, verbosity, contiguous/inclusive flags).
        perf_results: container that collects the measured latencies.
        test_all: whether to run every test configuration.
    """
    input_ids, segment_ids, input_mask = get_bert_inputs(
        model_setting.model_path, model_setting.input_ids_name,
        model_setting.segment_ids_name, model_setting.input_mask_name)

    print(
        f"Generating {test_setting.test_cases} samples for batch_size={test_setting.batch_size} sequence_length={test_setting.sequence_length}"
    )

    # Do not generate random mask for performance test.
    all_inputs = generate_test_data(
        test_setting.batch_size,
        test_setting.sequence_length,
        test_setting.test_cases,
        test_setting.seed,
        test_setting.verbose,
        input_ids,
        segment_ids,
        input_mask,
        random_mask_length=False)

    if test_setting.contiguous:
        # Converting to contiguous arrays can speed up inference; optionally
        # charge the conversion cost to the reported latency ("inclusive").
        all_inputs, contiguous_latency = get_contiguous_inputs(all_inputs)
        print(
            "Extra latency for converting inputs to contiguous: {} ms".format(
                format(contiguous_latency, '.2f')))
        test_setting.extra_latency = (contiguous_latency
                                      if test_setting.inclusive else 0)

    run_perf_tests(model_setting, test_setting, perf_results, test_all,
                   all_inputs)
def create_bert_inputs(onnx_model,
                       batch_size,
                       sequence_length,
                       samples,
                       input_ids_name=None,
                       segment_ids_name=None,
                       input_mask_name=None):
    """Create dummy inputs for BERT model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        samples (int): number of samples
        input_ids_name (str, optional): Name of graph input for input IDs. Defaults to None.
        segment_ids_name (str, optional): Name of graph input for segment IDs. Defaults to None.
        input_mask_name (str, optional): Name of graph input for attention mask. Defaults to None.

    Returns:
        List[Dict]: list of inputs
    """
    # Local import keeps this module usable when bert_test_data is absent.
    from bert_test_data import find_bert_inputs, generate_test_data

    ids_input, segments_input, mask_input = find_bert_inputs(
        onnx_model, input_ids_name, segment_ids_name, input_mask_name)

    # Fixed seed and fixed mask length make the generated samples reproducible.
    return generate_test_data(batch_size,
                              sequence_length,
                              test_cases=samples,
                              seed=123,
                              verbose=False,
                              input_ids=ids_input,
                              segment_ids=segments_input,
                              input_mask=mask_input,
                              random_mask_length=False)
def run_performance(model_setting, test_setting, perf_results):
    """Generate fixed-mask test samples and launch the performance tests.

    Args:
        model_setting: model configuration (model path and graph input names).
        test_setting: test configuration (batch size, sequence length, number
            of test cases, seed, verbosity).
        perf_results: container that collects the measured latencies.
    """
    input_ids, segment_ids, input_mask = get_bert_inputs(
        model_setting.model_path,
        model_setting.input_ids_name,
        model_setting.segment_ids_name,
        model_setting.input_mask_name,
    )

    print(
        f"Generating {test_setting.test_cases} samples for batch_size={test_setting.batch_size} sequence_length={test_setting.sequence_length}"
    )

    # Do not generate random mask for performance test.
    sample_inputs = generate_test_data(
        test_setting.batch_size,
        test_setting.sequence_length,
        test_setting.test_cases,
        test_setting.seed,
        test_setting.verbose,
        input_ids,
        segment_ids,
        input_mask,
        random_mask_length=False,
    )

    run_perf_tests(model_setting, test_setting, perf_results, sample_inputs)
def run_performance(perf_results, model_path, batch_size, sequence_length,
                    use_gpu, test_cases, test_times, seed, verbose, inclusive,
                    test_all, no_warmup, opt_level):
    """Run performance tests, optionally repeating with contiguous inputs.

    Args:
        perf_results: container that collects the measured latencies.
        model_path: path to the ONNX model; input names are deduced from it.
        batch_size / sequence_length: input dimensions for the samples.
        use_gpu: whether to run on GPU.
        test_cases / test_times: number of samples and repetitions.
        seed: RNG seed for sample generation.
        verbose: print extra diagnostics.
        inclusive: count contiguous-conversion latency in the reported results.
        test_all: also run the contiguous-input configuration.
        no_warmup: skip warm-up runs.
        opt_level: graph optimization level to use.
    """
    # Try deduce input names from model.
    input_ids, segment_ids, input_mask = get_bert_inputs(model_path)

    print(
        f"Generating {test_cases} samples for batch_size={batch_size} sequence_length={sequence_length}"
    )

    # Do not generate random mask for performance test.
    sample_inputs = generate_test_data(batch_size,
                                       sequence_length,
                                       test_cases,
                                       seed,
                                       verbose,
                                       input_ids,
                                       segment_ids,
                                       input_mask,
                                       random_mask_length=False)

    # First pass: non-contiguous inputs, no conversion overhead.
    run_perf_tests(perf_results,
                   model_path,
                   batch_size,
                   sequence_length,
                   use_gpu,
                   test_cases,
                   test_times,
                   False,
                   sample_inputs,
                   test_all,
                   no_warmup,
                   opt_level,
                   extra_latency=0)

    # only test contiguous array when the --all flag is set.
    if not test_all:
        return

    # Convert inputs to contiguous array, which could improve inference performance
    sample_inputs, contiguous_latency = get_contiguous_inputs(sample_inputs)
    print("Extra latency for converting inputs to contiguous: {} ms".format(
        format(contiguous_latency, '.2f')))

    run_perf_tests(perf_results,
                   model_path,
                   batch_size,
                   sequence_length,
                   use_gpu,
                   test_cases,
                   test_times,
                   True,
                   sample_inputs,
                   test_all,
                   no_warmup,
                   opt_level,
                   extra_latency=contiguous_latency if inclusive else 0)
def run_test(baseline_model, optimized_model, output_dir, batch_size,
             sequence_length, use_gpu, test_cases, seed, use_openmp, verbose,
             rtol, atol, input_ids_name, segment_ids_name, input_mask_name):
    """Compare outputs of a baseline model against an optimized model.

    Generates random test inputs, runs both models, optionally dumps the
    inputs to output_dir, and checks that results agree within rtol/atol.

    Args:
        baseline_model: path of the unoptimized reference model.
        optimized_model: path of the optimized model under test.
        output_dir: directory to dump test data to, or None to skip dumping.
        batch_size / sequence_length: input dimensions for the samples.
        use_gpu: whether to run on GPU.
        test_cases: number of samples to generate.
        seed: RNG seed for sample generation.
        use_openmp: configure OpenMP with physical core count when True.
        verbose: print latency diagnostics.
        rtol / atol: relative and absolute tolerances for comparison.
        input_ids_name / segment_ids_name / input_mask_name: graph input names.
    """
    # Try deduce input names from optimized model.
    input_ids, segment_ids, input_mask = get_bert_inputs(
        optimized_model, input_ids_name, segment_ids_name, input_mask_name)

    # Use random mask length for accuracy test. It might introduce slight inflation in latency reported in this script.
    all_inputs = generate_test_data(batch_size,
                                    sequence_length,
                                    test_cases,
                                    seed,
                                    verbose,
                                    input_ids,
                                    segment_ids,
                                    input_mask,
                                    random_mask_length=True)

    # OpenMP environment variables must be set before the very first "import onnxruntime"
    num_threads = psutil.cpu_count(logical=False) if use_openmp else 1
    setup_openmp_environ(omp_num_threads=num_threads, omp_wait_policy='ACTIVE')

    baseline_results, baseline_latency, output_names = run_model(
        baseline_model,
        all_inputs,
        use_gpu,
        use_openmp,
        disable_optimization=True)
    if verbose:
        print("baseline average latency (all optimizations disabled): {} ms".
              format(statistics.mean(baseline_latency) * 1000))

    if output_dir is not None:
        for case_index, case_inputs in enumerate(all_inputs):
            output_test_data(output_dir, case_index, case_inputs)

    treatment_results, treatment_latency, treatment_output_names = run_model(
        optimized_model,
        all_inputs,
        use_gpu,
        use_openmp,
        disable_optimization=False)
    if verbose:
        print("treatment average latency: {} ms".format(
            statistics.mean(treatment_latency) * 1000))

    # Validate the output of baseline and treatment, to make sure the results are similar.
    compare(baseline_results, treatment_results, verbose, rtol, atol)
def run_test(
    baseline_model,
    optimized_model,
    output_dir,
    batch_size,
    sequence_length,
    use_gpu,
    test_cases,
    seed,
    verbose,
    rtol,
    atol,
    input_ids_name,
    segment_ids_name,
    input_mask_name,
):
    """Compare outputs of a baseline model against an optimized model.

    Generates random test inputs, runs both models, optionally dumps the
    inputs to output_dir, and checks that results agree within rtol/atol.

    Args:
        baseline_model: path of the unoptimized reference model.
        optimized_model: path of the optimized model under test.
        output_dir: directory to dump test data to, or None to skip dumping.
        batch_size / sequence_length: input dimensions for the samples.
        use_gpu: whether to run on GPU.
        test_cases: number of samples to generate.
        seed: RNG seed for sample generation.
        verbose: print latency diagnostics.
        rtol / atol: relative and absolute tolerances for comparison.
        input_ids_name / segment_ids_name / input_mask_name: graph input names.
    """
    # Try deduce input names from optimized model.
    input_ids, segment_ids, input_mask = get_bert_inputs(
        optimized_model, input_ids_name, segment_ids_name, input_mask_name)

    # Use random mask length for accuracy test. It might introduce slight inflation in latency reported in this script.
    all_inputs = generate_test_data(
        batch_size,
        sequence_length,
        test_cases,
        seed,
        verbose,
        input_ids,
        segment_ids,
        input_mask,
        random_mask_length=True,
    )

    baseline_results, baseline_latency, output_names = run_model(
        baseline_model, all_inputs, use_gpu, disable_optimization=True)
    if verbose:
        print("baseline average latency (all optimizations disabled): {} ms".
              format(statistics.mean(baseline_latency) * 1000))

    if output_dir is not None:
        for case_index, case_inputs in enumerate(all_inputs):
            output_test_data(output_dir, case_index, case_inputs)

    treatment_results, treatment_latency, treatment_output_names = run_model(
        optimized_model, all_inputs, use_gpu, disable_optimization=False)
    if verbose:
        print("treatment average latency: {} ms".format(
            statistics.mean(treatment_latency) * 1000))

    # Validate the output of baseline and treatment, to make sure the results are similar.
    compare(baseline_results, treatment_results, verbose, rtol, atol)
def create_bert_inputs(model, batch_size, sequence_length, samples,
                       input_ids_name, segment_ids_name, input_mask_name):
    """Create dummy input samples for a BERT model.

    Args:
        model: the BERT ONNX model (or model path) to deduce inputs from.
        batch_size: batch size of each generated sample.
        sequence_length: sequence length of each generated sample.
        samples: number of samples to generate.
        input_ids_name: name of the graph input for input IDs.
        segment_ids_name: name of the graph input for segment IDs.
        input_mask_name: name of the graph input for the attention mask.

    Returns:
        The list of generated inputs.
    """
    # Local import keeps this module usable when bert_test_data is absent.
    from bert_test_data import get_bert_inputs, generate_test_data

    ids_input, segments_input, mask_input = get_bert_inputs(
        model, input_ids_name, segment_ids_name, input_mask_name)

    # Fixed seed and fixed mask length make the generated samples reproducible.
    return generate_test_data(batch_size,
                              sequence_length,
                              test_cases=samples,
                              seed=123,
                              verbose=False,
                              input_ids=ids_input,
                              segment_ids=segments_input,
                              input_mask=mask_input,
                              random_mask_length=False)
def run_performance(average_latency, model_path, batch_size, sequence_length,
                    use_gpu, test_cases, test_times, seed, verbose,
                    run_all_settings):
    """Run performance tests and return the contiguous-conversion latency.

    When run_all_settings is True, first measures with the original
    (non-contiguous) inputs; it then always measures with contiguous inputs.

    Args:
        average_latency: container that collects the measured latencies.
        model_path: path to the ONNX model; input names are deduced from it.
        batch_size / sequence_length: input dimensions for the samples.
        use_gpu: whether to run on GPU.
        test_cases / test_times: number of samples and repetitions.
        seed: RNG seed for sample generation.
        verbose: print extra diagnostics.
        run_all_settings: also run the non-contiguous configuration.

    Returns:
        The extra latency (ms) spent converting inputs to contiguous arrays.
    """
    # Try deduce input names from model.
    input_ids, segment_ids, input_mask = get_bert_inputs(model_path)

    # Do not generate random mask for performance test.
    print("generating test data...")
    sample_inputs = generate_test_data(batch_size,
                                       sequence_length,
                                       test_cases,
                                       seed,
                                       verbose,
                                       input_ids,
                                       segment_ids,
                                       input_mask,
                                       random_mask_length=False)

    if run_all_settings:
        # Baseline pass with non-contiguous inputs.
        run_perf_tests(average_latency, model_path, batch_size,
                       sequence_length, use_gpu, test_cases, test_times, seed,
                       verbose, False, input_ids, segment_ids, input_mask,
                       sample_inputs, run_all_settings)

    # Convert inputs to contiguous array, which could improve inference performance
    sample_inputs, contiguous_latency = get_contiguous_inputs(sample_inputs)
    print("Extra latency for converting inputs to contiguous: {} ms".format(
        format(contiguous_latency, '.2f')))

    run_perf_tests(average_latency, model_path, batch_size, sequence_length,
                   use_gpu, test_cases, test_times, seed, verbose, True,
                   input_ids, segment_ids, input_mask, sample_inputs,
                   run_all_settings)

    return contiguous_latency