def _test_optimizer_on_tf_model(self, model_name, expected_fusion_result_list, inputs_count, validate_model=True):
    # The expected fusion result list has counts for the following keys:
    # EmbedLayerNormalization, Attention, Gelu, FastGelu, BiasGelu, LayerNormalization, SkipLayerNormalization
    model_fusion_statistics = {}

    from onnx_exporter import export_onnx_model_from_tf
    from huggingface_models import MODELS
    from benchmark_helper import Precision

    print("testing model ", model_name)
    print("testing input number = ", inputs_count)
    input_names = MODELS[model_name][0]

    import torch
    with torch.no_grad():
        _, is_valid_onnx_model, _, _ = export_onnx_model_from_tf(
            model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], None, './cache_models',
            './onnx_models', input_names[:inputs_count], False, Precision.FLOAT32, True, True, True, True,
            model_fusion_statistics)

    onnx_model = list(model_fusion_statistics.keys())[0]
    fusion_result_list = list(model_fusion_statistics[onnx_model].values())

    if validate_model:
        self.assertEqual(is_valid_onnx_model, True)
    self.assertEqual(fusion_result_list, expected_fusion_result_list)
def _test_optimizer_on_tf_model(self, model_name, expected_fusion_result_list, inputs_count, validate_model=True):
    # Remove cached models so that the CI machine has free space.
    import shutil
    shutil.rmtree('./cache_models', ignore_errors=True)
    shutil.rmtree('./onnx_models', ignore_errors=True)

    # The expected fusion result list has counts for the following keys:
    # EmbedLayerNormalization, Attention, Gelu, FastGelu, BiasGelu, LayerNormalization, SkipLayerNormalization
    model_fusion_statistics = {}

    print("testing model ", model_name)
    print("testing input number = ", inputs_count)
    input_names = MODELS[model_name][0]

    import torch
    with torch.no_grad():
        _, is_valid_onnx_model, _, _ = export_onnx_model_from_tf(model_name, MODELS[model_name][1],
                                                                 MODELS[model_name][2], MODELS[model_name][3], None,
                                                                 './cache_models', './onnx_models',
                                                                 input_names[:inputs_count], False, Precision.FLOAT32,
                                                                 True, True, True, True, model_fusion_statistics)

    onnx_model = list(model_fusion_statistics.keys())[0]
    fusion_result_list = list(model_fusion_statistics[onnx_model].values())

    if validate_model:
        self.assertEqual(is_valid_onnx_model, True)
    self.assertEqual(fusion_result_list, expected_fusion_result_list)
def _test_optimizer_on_tf_model(self, model_name, expected_fusion_result_list, inputs_count, validate_model=True):
    # Remove cached models so that the CI machine has free space.
    shutil.rmtree("./cache_models", ignore_errors=True)
    shutil.rmtree("./onnx_models", ignore_errors=True)

    # The expected fusion result list has counts for the following keys:
    # EmbedLayerNormalization, Attention, Gelu, FastGelu, BiasGelu, LayerNormalization, SkipLayerNormalization
    model_fusion_statistics = {}

    print("testing model ", model_name)
    print("testing input number = ", inputs_count)
    input_names = MODELS[model_name][0]

    config_modifier = ConfigModifier(None)
    fusion_options = None
    model_class = "AutoModel"
    with torch.no_grad():
        _, is_valid_onnx_model, _, _ = export_onnx_model_from_tf(
            model_name,
            MODELS[model_name][1],  # opset version
            MODELS[model_name][2],  # use_external_data_format
            MODELS[model_name][3],  # optimization model type
            model_class,
            config_modifier,
            "./cache_models",
            "./onnx_models",
            input_names[:inputs_count],
            False,
            Precision.FLOAT32,
            True,
            True,
            True,
            True,
            model_fusion_statistics,
            fusion_options,
        )

    onnx_model = list(model_fusion_statistics.keys())[0]
    fusion_result_list = list(model_fusion_statistics[onnx_model].values())

    if validate_model:
        self.assertEqual(is_valid_onnx_model, True)
    self.assertEqual(fusion_result_list, expected_fusion_result_list)
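# A hedged usage sketch (not part of the original tests): the helper above is written as a
# unittest.TestCase method, so a test in the same module could drive it as shown here. The
# model name and fusion counts are illustrative placeholders, not verified expected values.
import unittest


class ExampleTfOptimizerTests(unittest.TestCase):
    # Reuse the module-level helper defined above as a method of this TestCase.
    _test_optimizer_on_tf_model = _test_optimizer_on_tf_model

    def test_tf_bert_fusion_counts(self):
        # Count order follows the keys noted in the helper's comment: EmbedLayerNormalization,
        # Attention, Gelu, FastGelu, BiasGelu, LayerNormalization, SkipLayerNormalization.
        expected_fusion_result_list = [1, 12, 0, 0, 12, 0, 24]  # placeholder counts for illustration only
        self._test_optimizer_on_tf_model("bert-base-cased", expected_fusion_result_list, inputs_count=1)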
def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
                    repeat_times, input_counts, optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose, overwrite,
                    disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics, model_source):
    import onnxruntime

    results = []
    if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
        logger.error(
            "Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )
        return results

    if (not use_gpu) and ('CUDAExecutionProvider' in onnxruntime.get_available_providers()):
        logger.warning("Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance.")

    for model_name in model_names:
        all_input_names = MODELS[model_name][0]
        for num_inputs in input_counts:
            if num_inputs > len(all_input_names):
                break

            input_names = all_input_names[:num_inputs]

            if 'pt' in model_source:
                with torch.no_grad():
                    onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_pt(
                        model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class,
                        cache_dir, onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx,
                        use_raw_attention_mask, overwrite, model_fusion_statistics)
            if 'tf' in model_source:
                onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_tf(
                    model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class,
                    cache_dir, onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx,
                    use_raw_attention_mask, overwrite, model_fusion_statistics)

            if not is_valid_onnx_model:
                continue

            ort_session = create_onnxruntime_session(onnx_model_file,
                                                     use_gpu,
                                                     enable_all_optimization=True,
                                                     num_threads=num_threads,
                                                     verbose=verbose)
            if ort_session is None:
                continue

            ort_output_names = [node_arg.name for node_arg in ort_session.get_outputs()]
            output_buffers = {"last_state": None, "pooler": None}
            device = "cuda" if use_gpu else "cpu"
            config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
            max_last_state_size = numpy.prod(
                [max(batch_sizes), max(sequence_lengths),
                 max(vocab_size, config.hidden_size)])
            max_pooler_size = numpy.prod([max(batch_sizes), config.hidden_size])

            for batch_size in batch_sizes:
                if batch_size <= 0:
                    continue

                for sequence_length in sequence_lengths:
                    if max_sequence_length is not None and sequence_length > max_sequence_length:
                        continue

                    input_value_type = numpy.int64 if 'pt' in model_source else numpy.int32
                    ort_inputs = create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names,
                                                          input_value_type)

                    result_template = {
                        "engine": "onnxruntime",
                        "version": onnxruntime.__version__,
                        "device": device,
                        "optimizer": optimize_onnx,
                        "precision": precision,
                        "io_binding": not disable_ort_io_binding,
                        "model_name": model_name,
                        "inputs": num_inputs,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "datetime": str(datetime.now()),
                    }

                    logger.info("Run onnxruntime on {} with input shape {}".format(model_name,
                                                                                    [batch_size, sequence_length]))
                    if disable_ort_io_binding:
                        result = inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_size)
                    else:
                        # Get output sizes from a dummy ort run
                        ort_outputs = ort_session.run(ort_output_names, ort_inputs)

                        data_type = numpy.longlong if 'pt' in model_source else numpy.int32
                        result = inference_ort_with_io_binding(ort_session, ort_inputs, result_template, repeat_times,
                                                               ort_output_names, ort_outputs, output_buffers,
                                                               max_last_state_size, max_pooler_size, batch_size,
                                                               device, data_type)
                    logger.info(result)
                    results.append(result)

    return results
def run_onnxruntime(
    use_gpu,
    provider,
    model_names,
    model_class,
    config_modifier,
    precision,
    num_threads,
    batch_sizes,
    sequence_lengths,
    repeat_times,
    input_counts,
    optimizer_info,
    validate_onnx,
    cache_dir,
    onnx_dir,
    verbose,
    overwrite,
    disable_ort_io_binding,
    use_raw_attention_mask,
    model_fusion_statistics,
    model_source,
    args,
):
    import onnxruntime

    results = []
    if (use_gpu and ("CUDAExecutionProvider" not in onnxruntime.get_available_providers())
            and ("ROCMExecutionProvider" not in onnxruntime.get_available_providers())):
        logger.error(
            "Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )
        return results

    warm_up_repeat = 0
    if provider == "tensorrt":
        optimizer_info = OptimizerInfo.NOOPT
        warm_up_repeat = 5
        if "TensorrtExecutionProvider" not in onnxruntime.get_available_providers():
            logger.error(
                "Please install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance."
            )
            return results

    if optimizer_info == OptimizerInfo.NOOPT:
        logger.warning(
            f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied."
        )

    for model_name in model_names:
        all_input_names = MODELS[model_name][0]
        for num_inputs in input_counts:
            if num_inputs > len(all_input_names):
                break

            input_names = all_input_names[:num_inputs]
            args.model_type = MODELS[model_name][3]
            fusion_options = FusionOptions.parse(args)

            if "pt" in model_source:
                with torch.no_grad():
                    (
                        onnx_model_file,
                        is_valid_onnx_model,
                        vocab_size,
                        max_sequence_length,
                    ) = export_onnx_model_from_pt(
                        model_name,
                        MODELS[model_name][1],
                        MODELS[model_name][2],
                        MODELS[model_name][3],
                        model_class,
                        config_modifier,
                        cache_dir,
                        onnx_dir,
                        input_names,
                        use_gpu,
                        precision,
                        optimizer_info,
                        validate_onnx,
                        use_raw_attention_mask,
                        overwrite,
                        model_fusion_statistics,
                        fusion_options,
                    )
            if "tf" in model_source:
                (
                    onnx_model_file,
                    is_valid_onnx_model,
                    vocab_size,
                    max_sequence_length,
                ) = export_onnx_model_from_tf(
                    model_name,
                    MODELS[model_name][1],
                    MODELS[model_name][2],
                    MODELS[model_name][3],
                    model_class,
                    config_modifier,
                    cache_dir,
                    onnx_dir,
                    input_names,
                    use_gpu,
                    precision,
                    optimizer_info,
                    validate_onnx,
                    use_raw_attention_mask,
                    overwrite,
                    model_fusion_statistics,
                    fusion_options,
                )

            if not is_valid_onnx_model:
                continue

            ort_session = create_onnxruntime_session(
                onnx_model_file,
                use_gpu,
                provider,
                enable_all_optimization=True,
                num_threads=num_threads,
                verbose=verbose,
            )
            if ort_session is None:
                continue

            ort_output_names = [node_arg.name for node_arg in ort_session.get_outputs()]
            output_buffers = []
            device = "cuda" if use_gpu else "cpu"
            config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
            max_last_state_size = numpy.prod([
                max(batch_sizes),
                max(sequence_lengths),
                max(vocab_size, config.hidden_size),
            ])
            max_pooler_size = numpy.prod([max(batch_sizes), config.hidden_size])

            for batch_size in batch_sizes:
                if batch_size <= 0:
                    continue

                for sequence_length in sequence_lengths:
                    if max_sequence_length is not None and sequence_length > max_sequence_length:
                        continue

                    input_value_type = numpy.int64 if "pt" in model_source else numpy.int32
                    ort_inputs = create_onnxruntime_input(
                        vocab_size,
                        batch_size,
                        sequence_length,
                        input_names,
                        config,
                        input_value_type,
                    )
                    result_template = {
                        "engine": "onnxruntime",
                        "version": onnxruntime.__version__,
                        "providers": provider,
                        "device": device,
                        "optimizer": optimizer_info,
                        "precision": precision,
                        "io_binding": not disable_ort_io_binding,
                        "model_name": model_name,
                        "inputs": num_inputs,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }

                    logger.info("Run onnxruntime on {} with input shape {}".format(model_name,
                                                                                    [batch_size, sequence_length]))
                    if disable_ort_io_binding:
                        result = inference_ort(
                            ort_session,
                            ort_inputs,
                            result_template,
                            repeat_times,
                            batch_size,
                            warm_up_repeat,
                        )
                    else:
                        # Get output sizes from a dummy ort run
                        ort_outputs = ort_session.run(ort_output_names, ort_inputs)
                        output_buffer_max_sizes = [max_last_state_size]
                        for i in range(len(ort_outputs)):
                            if i == 2 and MODELS[model_name][3] == "gpt":
                                # past state output max size
                                output_buffer_max_sizes.append(max_pooler_size)
                            else:
                                output_buffer_max_sizes.append(max_last_state_size)

                        data_type = numpy.longlong if "pt" in model_source else numpy.intc
                        result = inference_ort_with_io_binding(
                            ort_session,
                            ort_inputs,
                            result_template,
                            repeat_times,
                            ort_output_names,
                            ort_outputs,
                            output_buffers,
                            output_buffer_max_sizes,
                            batch_size,
                            device,
                            data_type,
                            warm_up_repeat,
                        )
                    logger.info(result)
                    results.append(result)

    return results