def main():
    """Entry point: parse CLI arguments, optimize the input ONNX model, and save the result."""
    args = _parse_arguments()
    _setup_logger(args.verbose)
    logger.debug(f"arguments:{args}")

    # Warn when the output would clobber the input file.
    same_path = os.path.realpath(args.input) == os.path.realpath(args.output)
    if same_path:
        logger.warning(
            f"Specified the same input and output path. Note that this may overwrite the original model"
        )

    fusion_opts = FusionOptions.parse(args)

    optimizer = optimize_model(
        args.input,
        args.model_type,
        args.num_heads,
        args.hidden_size,
        opt_level=args.opt_level,
        optimization_options=fusion_opts,
        use_gpu=args.use_gpu,
        only_onnxruntime=args.only_onnxruntime,
    )

    if args.float16:
        optimizer.convert_float_to_float16(keep_io_types=True)
    if args.input_int32:
        optimizer.change_graph_inputs_to_int32()

    optimizer.save_model_to_file(args.output, args.use_external_data_format)

    message = ("The model has been fully optimized."
               if optimizer.is_fully_optimized() else "The model has been optimized.")
    logger.info(message)
def optimize_onnx(onnx_model_path,
                  optimized_model_path,
                  is_float16,
                  num_attention_heads,
                  hidden_size,
                  use_external_data_format=False,
                  **kwargs):
    """Optimize a GPT-2 ONNX model, with an option to convert it to use mixed precision.

    Args:
        onnx_model_path: path of the input ONNX model.
        optimized_model_path: path to save the optimized model to.
        is_float16 (bool): when True, convert the optimized model to float16.
        num_attention_heads (int): number of attention heads of the model.
        hidden_size (int): hidden size of the model.
        use_external_data_format (bool, optional): store large (>2GB) weights in
            external files. Defaults to False.
        **kwargs: forwarded to convert_float_to_float16 (e.g. op_block_list).
    """
    from optimizer import optimize_model
    from fusion_options import FusionOptions

    # opt_level=0: only the Python fusion logic runs; onnxruntime graph
    # optimization is disabled (see optimize_model).
    optimization_options = FusionOptions('gpt2')
    m = optimize_model(onnx_model_path,
                       model_type='gpt2',
                       num_heads=num_attention_heads,
                       hidden_size=hidden_size,
                       opt_level=0,
                       optimization_options=optimization_options,
                       use_gpu=False)

    if is_float16:
        # Log which op types are blocked from fp16 conversion and which remain eligible.
        op_full_list = {node.op_type for node in m.nodes()}
        op_block_list = set(kwargs.get("op_block_list", []))  # single lookup instead of `in` + index
        op_remain_list = op_full_list.difference(op_block_list)
        logger.info(f"op_block_list={op_block_list} op_remain_list={op_remain_list}")
        m.convert_float_to_float16(use_symbolic_shape_infer=True, **kwargs)

    m.save_model_to_file(optimized_model_path, use_external_data_format)
def optimize_onnx(onnx_model_path,
                  optimized_model_path,
                  is_float16,
                  num_attention_heads,
                  hidden_size,
                  use_external_data_format=False,
                  auto_mixed_precision=False,
                  **kwargs):
    """Optimize a GPT-2 ONNX model, with an option to convert it to use mixed precision.

    Args:
        onnx_model_path: path of the input ONNX model.
        optimized_model_path: path to save the optimized model to.
        is_float16 (bool): when True, convert the optimized model to float16.
        num_attention_heads (int): number of attention heads of the model.
        hidden_size (int): hidden size of the model.
        use_external_data_format (bool, optional): store large (>2GB) weights in
            external files. Defaults to False.
        auto_mixed_precision (bool, optional): delegate the float16 conversion to
            Gpt2Helper.auto_mixed_precision instead of converting directly.
            Defaults to False.
        **kwargs: forwarded to convert_float_to_float16 (e.g. op_block_list).
    """
    from optimizer import optimize_model
    from fusion_options import FusionOptions

    # opt_level=0: only the Python fusion logic runs; onnxruntime graph
    # optimization is disabled (see optimize_model).
    optimization_options = FusionOptions('gpt2')
    m = optimize_model(onnx_model_path,
                       model_type='gpt2',
                       num_heads=num_attention_heads,
                       hidden_size=hidden_size,
                       opt_level=0,
                       optimization_options=optimization_options,
                       use_gpu=False)

    if is_float16:
        if auto_mixed_precision:
            # Let Gpt2Helper choose the mixed-precision conversion settings.
            Gpt2Helper.auto_mixed_precision(m)
        else:
            m.convert_float_to_float16(use_symbolic_shape_infer=True, **kwargs)

    m.save_model_to_file(optimized_model_path, use_external_data_format)
def optimize_by_fusion(
    model: ModelProto,
    model_type: str = "bert",
    num_heads: int = 0,
    hidden_size: int = 0,
    optimization_options: Optional[FusionOptions] = None,
):
    """Optimize Model by graph fusion logic.

    Note that ONNXRuntime graph optimizations (like constant folding) will not be
    applied here. It is therefore better to enable constant folding while exporting
    the ONNX model, or to run optimize_by_onnxruntime on the model first (as
    optimize_model does).

    For BERT models, num_heads and hidden_size are optional; for other model types
    they need to be specified.

    Args:
        model (ModelProto): model object
        model_type (str, optional): model type - like bert, bert_tf, bert_keras or gpt2. Defaults to 'bert'.
        num_heads (int, optional): number of attention heads. Defaults to 0.
            0 allows detecting the parameter from the graph automatically (model_type "bert" only).
        hidden_size (int, optional): hidden size. Defaults to 0.
            0 allows detecting the parameter from the graph automatically (model_type "bert" only).
        optimization_options (FusionOptions, optional): options that turn on/off some fusions. Defaults to None.

    Returns:
        object of an optimizer class.
    """
    # num_heads/hidden_size auto-detection is only supported for 'bert'.
    if (num_heads == 0 or hidden_size == 0) and model_type != "bert":
        logger.warning(
            "Please specify parameters of num_heads and hidden_size when model_type is not 'bert'"
        )

    optimizer_class, producer, _ = MODEL_TYPES[model_type]

    if model.producer_name and producer != model.producer_name:
        logger.warning(
            f'Model producer not matched: Expected "{producer}", Got "{model.producer_name}".'
            "Please specify correct --model_type parameter.")

    fusion_options = optimization_options if optimization_options is not None else FusionOptions(model_type)

    fusion_optimizer = optimizer_class(model, num_heads, hidden_size)
    fusion_optimizer.optimize(fusion_options)
    fusion_optimizer.topological_sort()

    # Stamp the optimized model with this package's producer metadata.
    fusion_optimizer.model.producer_name = "onnxruntime.transformers"
    from onnxruntime import __version__ as onnxruntime_version
    fusion_optimizer.model.producer_version = onnxruntime_version

    return fusion_optimizer
def optimize_onnx_model(
    model_name,
    onnx_model_path,
    optimized_model_path,
    model_type,
    num_attention_heads,
    hidden_size,
    use_gpu,
    precision,
    use_raw_attention_mask,
    overwrite,
    model_fusion_statistics,
    use_external_data_format,
    optimization_options=None,
):
    """Run graph fusion on an exported ONNX model and save the optimized model.

    Skips the work entirely when the optimized model already exists and
    `overwrite` is False. Records fusion statistics into
    `model_fusion_statistics` (keyed by `optimized_model_path`) and converts the
    model to float16 when `precision` is Precision.FLOAT16.

    Args:
        model_name: name of the model (unused here beyond the signature; kept for callers).
        onnx_model_path: path of the exported (unoptimized) ONNX model.
        optimized_model_path: path to save the optimized model to.
        model_type: model type key understood by the optimizer (e.g. bert, bert_tf, bert_keras, gpt2).
        num_attention_heads (int): number of attention heads.
        hidden_size (int): hidden size.
        use_gpu (bool): optimize for GPU inference.
        precision (Precision): target precision (FLOAT32 / FLOAT16 / INT8).
        use_raw_attention_mask (bool): passed to FusionOptions.use_raw_attention_mask.
        overwrite (bool): re-optimize even if the output file exists.
        model_fusion_statistics (dict): output map path -> fused-operator statistics.
        use_external_data_format (bool): store large (>2GB) weights in external files.
        optimization_options (FusionOptions, optional): pre-built options; a default
            FusionOptions(model_type) is created when None.
    """
    if overwrite or not os.path.exists(optimized_model_path):
        Path(optimized_model_path).parent.mkdir(parents=True, exist_ok=True)

        from fusion_options import FusionOptions
        from optimizer import optimize_model

        if optimization_options is None:  # fixed: identity check instead of `== None`
            optimization_options = FusionOptions(model_type)
        optimization_options.use_raw_attention_mask(use_raw_attention_mask)
        if Precision.FLOAT16 == precision:
            optimization_options.enable_gelu_approximation = True
        if Precision.INT8 == precision:
            optimization_options.enable_embed_layer_norm = False

        # Use script to optimize model.
        # Use opt_level <= 1 for models to be converted to fp16, because some fused op (like FusedGemm) has only fp32 and no fp16.
        # It is better to be conservative so we use opt_level=0 here, in case MemcpyFromHost is added to the graph by OnnxRuntime.
        opt_model = optimize_model(
            onnx_model_path,
            model_type,
            num_heads=num_attention_heads,
            hidden_size=hidden_size,
            opt_level=0,
            optimization_options=optimization_options,
            use_gpu=use_gpu,
            only_onnxruntime=False,
        )
        if model_type in ("bert_keras", "bert_tf"):
            # TF-exported models need dynamic axes restored after optimization.
            opt_model.use_dynamic_axes()

        model_fusion_statistics[optimized_model_path] = opt_model.get_fused_operator_statistics()
        if Precision.FLOAT16 == precision:
            opt_model.convert_float_to_float16(keep_io_types=True)

        opt_model.save_model_to_file(optimized_model_path, use_external_data_format)
    else:
        logger.info(f"Skip optimization since model existed: {optimized_model_path}")
def _parse_arguments():
    """Build the CLI for the graph-optimization tool and parse sys.argv."""
    arg_parser = argparse.ArgumentParser(
        description=
        'Graph optimization tool for ONNX Runtime. It transforms ONNX graph to use optimized operators for Transformer models.'
    )

    # Model input/output paths.
    arg_parser.add_argument('--input', required=True, type=str, help="input onnx model path")
    arg_parser.add_argument('--output', required=True, type=str, help="optimized onnx model path")

    # Model description.
    arg_parser.add_argument('--model_type',
                            required=False,
                            type=str.lower,
                            default="bert",
                            choices=list(MODEL_TYPES.keys()),
                            help="Model type selected in the list: " + ", ".join(MODEL_TYPES.keys()))
    arg_parser.add_argument(
        '--num_heads',
        required=False,
        type=int,
        default=0,
        help=
        "number of attention heads like 12 for bert-base and 16 for bert-large. Default is 0 to detect automatically for BERT. For other model type, this parameter need specify correctly."
    )
    arg_parser.add_argument(
        '--hidden_size',
        required=False,
        type=int,
        default=0,
        help=
        "hidden size like 768 for bert-base and 1024 for bert-large. Default is 0 to detect automatically for BERT. For other model type, this parameter need specify correctly."
    )

    # Precision / input-type conversion flags.
    arg_parser.add_argument(
        '--input_int32',
        required=False,
        action='store_true',
        help=
        "Use int32 (instead of int64) inputs. It could avoid unnecessary data cast when EmbedLayerNormalization is fused for BERT."
    )
    arg_parser.set_defaults(input_int32=False)
    arg_parser.add_argument(
        '--float16',
        required=False,
        action='store_true',
        help=
        "Convert all weights and nodes in float32 to float16. It has potential loss in precision compared to mixed precision conversion (see convert_float_to_float16)."
    )
    arg_parser.set_defaults(float16=False)

    # Fusion on/off switches are contributed by FusionOptions.
    FusionOptions.add_arguments(arg_parser)

    arg_parser.add_argument('--verbose',
                            required=False,
                            action='store_true',
                            help="show debug information.")
    arg_parser.set_defaults(verbose=False)

    # Runtime/optimization-level options.
    arg_parser.add_argument(
        '--use_gpu',
        required=False,
        action='store_true',
        help=
        "Use GPU for inference. Set this flag if your model is intended for GPU when opt_level > 1.")
    arg_parser.set_defaults(use_gpu=False)

    arg_parser.add_argument('--only_onnxruntime',
                            required=False,
                            action='store_true',
                            help="optimized by onnxruntime only, and no graph fusion in Python")
    arg_parser.set_defaults(only_onnxruntime=False)

    arg_parser.add_argument(
        '--opt_level',
        required=False,
        type=int,
        choices=[0, 1, 2, 99],
        default=None,
        help=
        "onnxruntime optimization level. 0 will disable onnxruntime graph optimization. The recommended value is 1. When opt_level > 1 is used, optimized model for GPU might not run in CPU. Level 2 and 99 are intended for --only_onnxruntime."
    )

    arg_parser.add_argument('--use_external_data_format',
                            required=False,
                            action='store_true',
                            help="use external data format to store large model (>2GB)")
    arg_parser.set_defaults(use_external_data_format=False)

    return arg_parser.parse_args()
def run_onnxruntime(
    use_gpu,
    provider,
    model_names,
    model_class,
    config_modifier,
    precision,
    num_threads,
    batch_sizes,
    sequence_lengths,
    repeat_times,
    input_counts,
    optimizer_info,
    validate_onnx,
    cache_dir,
    onnx_dir,
    verbose,
    overwrite,
    disable_ort_io_binding,
    use_raw_attention_mask,
    model_fusion_statistics,
    model_source,
    args,
):
    """Benchmark the given models with ONNX Runtime.

    For each model: exports it to ONNX (from PyTorch and/or TensorFlow, per
    `model_source`), creates an onnxruntime session, and measures inference
    latency over every (batch_size, sequence_length, num_inputs) combination.

    Returns:
        list of result dicts (one per measured configuration); empty when the
        requested provider/GPU support is unavailable.
    """
    import onnxruntime

    results = []
    # Bail out early if a GPU run was requested but no GPU-capable provider is installed.
    if (use_gpu and ("CUDAExecutionProvider" not in onnxruntime.get_available_providers())
            and ("ROCMExecutionProvider" not in onnxruntime.get_available_providers())):
        logger.error(
            "Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )
        return results

    warm_up_repeat = 0
    if provider == "tensorrt":
        # TensorRT does its own optimization; skip our graph optimizer and warm up the engine.
        optimizer_info = OptimizerInfo.NOOPT
        warm_up_repeat = 5
        if "TensorrtExecutionProvider" not in onnxruntime.get_available_providers():
            logger.error(
                "Please install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance."
            )
            return results

    if optimizer_info == OptimizerInfo.NOOPT:
        logger.warning(
            f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied."
        )

    for model_name in model_names:
        all_input_names = MODELS[model_name][0]
        for num_inputs in input_counts:
            # input_counts is assumed ascending: once it exceeds the available
            # inputs, larger counts would too — TODO confirm against callers.
            if num_inputs > len(all_input_names):
                break
            input_names = all_input_names[:num_inputs]
            # FusionOptions.parse reads args.model_type, so patch it per model.
            args.model_type = MODELS[model_name][3]
            fusion_options = FusionOptions.parse(args)

            # NOTE(review): if model_source contains neither "pt" nor "tf",
            # is_valid_onnx_model below is unbound (NameError) — confirm callers
            # always pass one of them.
            if "pt" in model_source:
                with torch.no_grad():
                    (
                        onnx_model_file,
                        is_valid_onnx_model,
                        vocab_size,
                        max_sequence_length,
                    ) = export_onnx_model_from_pt(
                        model_name,
                        MODELS[model_name][1],
                        MODELS[model_name][2],
                        MODELS[model_name][3],
                        model_class,
                        config_modifier,
                        cache_dir,
                        onnx_dir,
                        input_names,
                        use_gpu,
                        precision,
                        optimizer_info,
                        validate_onnx,
                        use_raw_attention_mask,
                        overwrite,
                        model_fusion_statistics,
                        fusion_options,
                    )
            if "tf" in model_source:
                (
                    onnx_model_file,
                    is_valid_onnx_model,
                    vocab_size,
                    max_sequence_length,
                ) = export_onnx_model_from_tf(
                    model_name,
                    MODELS[model_name][1],
                    MODELS[model_name][2],
                    MODELS[model_name][3],
                    model_class,
                    config_modifier,
                    cache_dir,
                    onnx_dir,
                    input_names,
                    use_gpu,
                    precision,
                    optimizer_info,
                    validate_onnx,
                    use_raw_attention_mask,
                    overwrite,
                    model_fusion_statistics,
                    fusion_options,
                )
            if not is_valid_onnx_model:
                continue

            ort_session = create_onnxruntime_session(
                onnx_model_file,
                use_gpu,
                provider,
                enable_all_optimization=True,
                num_threads=num_threads,
                verbose=verbose,
            )
            if ort_session is None:
                continue

            ort_output_names = [node_arg.name for node_arg in ort_session.get_outputs()]
            # Reused across configurations by inference_ort_with_io_binding,
            # so buffers are sized for the worst case below.
            output_buffers = []
            device = "cuda" if use_gpu else "cpu"
            config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
            # Upper bound on any output tensor size across all tested shapes.
            max_last_state_size = numpy.prod([
                max(batch_sizes),
                max(sequence_lengths),
                max(vocab_size, config.hidden_size),
            ])
            max_pooler_size = numpy.prod([max(batch_sizes), config.hidden_size])

            for batch_size in batch_sizes:
                if batch_size <= 0:
                    continue
                for sequence_length in sequence_lengths:
                    if max_sequence_length is not None and sequence_length > max_sequence_length:
                        continue

                    # PyTorch exports use int64 inputs; TF exports use int32.
                    input_value_type = numpy.int64 if "pt" in model_source else numpy.int32
                    ort_inputs = create_onnxruntime_input(
                        vocab_size,
                        batch_size,
                        sequence_length,
                        input_names,
                        config,
                        input_value_type,
                    )
                    result_template = {
                        "engine": "onnxruntime",
                        "version": onnxruntime.__version__,
                        "providers": provider,
                        "device": device,
                        "optimizer": optimizer_info,
                        "precision": precision,
                        "io_binding": not disable_ort_io_binding,
                        "model_name": model_name,
                        "inputs": num_inputs,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }

                    logger.info("Run onnxruntime on {} with input shape {}".format(
                        model_name, [batch_size, sequence_length]))
                    if disable_ort_io_binding:
                        result = inference_ort(
                            ort_session,
                            ort_inputs,
                            result_template,
                            repeat_times,
                            batch_size,
                            warm_up_repeat,
                        )
                    else:
                        # Get output sizes from a dummy ort run
                        ort_outputs = ort_session.run(ort_output_names, ort_inputs)
                        output_buffer_max_sizes = [max_last_state_size]
                        for i in range(len(ort_outputs)):
                            if i == 2 and MODELS[model_name][3] == "gpt":
                                # past state output max size
                                # NOTE(review): literal "gpt" may never match
                                # entries such as "gpt2" in MODELS — confirm.
                                output_buffer_max_sizes.append(max_pooler_size)
                            else:
                                output_buffer_max_sizes.append(max_last_state_size)

                        data_type = numpy.longlong if "pt" in model_source else numpy.intc
                        result = inference_ort_with_io_binding(
                            ort_session,
                            ort_inputs,
                            result_template,
                            repeat_times,
                            ort_output_names,
                            ort_outputs,
                            output_buffers,
                            output_buffer_max_sizes,
                            batch_size,
                            device,
                            data_type,
                            warm_up_repeat,
                        )
                    logger.info(result)
                    results.append(result)

    return results
def parse_arguments():
    """Build the benchmark CLI and parse sys.argv."""
    cli_parser = argparse.ArgumentParser()

    # Model selection.
    cli_parser.add_argument(
        "-m",
        "--models",
        required=False,
        nargs="+",
        type=str,
        default=["bert-base-cased", "roberta-base", "gpt2"],
        choices=list(MODELS.keys()),
        help="Pre-trained models in the list: " + ", ".join(MODELS.keys()),
    )
    # NOTE: nargs=1 makes args.model_source a one-element list while the
    # default stays a plain string; downstream `"pt" in model_source` works
    # with either form.
    cli_parser.add_argument(
        "--model_source",
        required=False,
        nargs=1,
        type=str,
        default="pt",
        choices=["pt", "tf"],
        help="Export onnx from pt or tf",
    )
    cli_parser.add_argument(
        "--model_class",
        required=False,
        type=str,
        default=None,
        choices=list(MODEL_CLASSES),
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES),
    )
    cli_parser.add_argument(
        "-e",
        "--engines",
        required=False,
        nargs="+",
        type=str,
        default=["onnxruntime"],
        choices=["onnxruntime", "torch", "torchscript", "tensorflow"],
        help="Engines to benchmark",
    )

    # Directories.
    cli_parser.add_argument(
        "-c",
        "--cache_dir",
        required=False,
        type=str,
        default=os.path.join(".", "cache_models"),
        help="Directory to cache pre-trained models",
    )
    cli_parser.add_argument(
        "--onnx_dir",
        required=False,
        type=str,
        default=os.path.join(".", "onnx_models"),
        help="Directory to store onnx models",
    )

    # Execution environment.
    cli_parser.add_argument("-g",
                            "--use_gpu",
                            required=False,
                            action="store_true",
                            help="Run on gpu device")
    cli_parser.add_argument(
        "--provider",
        required=False,
        type=str,
        default=None,
        help="Execution provider to use",
    )
    cli_parser.add_argument(
        "-p",
        "--precision",
        type=Precision,
        default=Precision.FLOAT32,
        choices=list(Precision),
        help=
        "Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization",
    )

    cli_parser.add_argument("--verbose",
                            required=False,
                            action="store_true",
                            help="Print more information")
    cli_parser.add_argument(
        "--overwrite",
        required=False,
        action="store_true",
        help="Overwrite existing models",
    )
    cli_parser.add_argument(
        "-o",
        "--optimizer_info",
        type=OptimizerInfo,
        default=OptimizerInfo.BYSCRIPT,
        choices=list(OptimizerInfo),
        help=
        "Optimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_opt",
    )
    cli_parser.add_argument(
        "-v",
        "--validate_onnx",
        required=False,
        action="store_true",
        help="Validate ONNX model",
    )

    # Result output files.
    cli_parser.add_argument(
        "-f",
        "--fusion_csv",
        required=False,
        default=None,
        help="CSV file for saving summary results of graph optimization.",
    )
    cli_parser.add_argument(
        "-d",
        "--detail_csv",
        required=False,
        default=None,
        help="CSV file for saving detail results.",
    )
    cli_parser.add_argument(
        "-r",
        "--result_csv",
        required=False,
        default=None,
        help="CSV file for saving summary results.",
    )

    # Workload shape.
    cli_parser.add_argument(
        "-i",
        "--input_counts",
        required=False,
        nargs="+",
        default=[1],
        type=int,
        choices=[1, 2, 3],
        help=
        "Number of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.",
    )
    cli_parser.add_argument(
        "-t",
        "--test_times",
        required=False,
        default=100,
        type=int,
        help="Number of repeat times to get average inference latency.",
    )
    cli_parser.add_argument("-b", "--batch_sizes", nargs="+", type=int, default=[1])
    cli_parser.add_argument(
        "-s",
        "--sequence_lengths",
        nargs="+",
        type=int,
        default=[4, 8, 16, 32, 64, 128, 256],
    )

    cli_parser.add_argument(
        "--disable_ort_io_binding",
        required=False,
        action="store_true",
        help="Disable running ONNX Runtime with binded inputs and outputs. ",
    )
    cli_parser.set_defaults(disable_ort_io_binding=False)

    cli_parser.add_argument(
        "-n",
        "--num_threads",
        required=False,
        nargs="+",
        type=int,
        default=[0],
        help="Threads to use",
    )
    cli_parser.add_argument(
        "--force_num_layers",
        required=False,
        type=int,
        default=None,
        help="Manually set the model's layer number",
    )

    # Fusion on/off switches are contributed by FusionOptions.
    FusionOptions.add_arguments(cli_parser)

    return cli_parser.parse_args()
def optimize_model(input,
                   model_type='bert',
                   num_heads=0,
                   hidden_size=0,
                   optimization_options=None,
                   opt_level=None,
                   use_gpu=False,
                   only_onnxruntime=False):
    """ Optimize Model by OnnxRuntime and/or python fusion logic.

    ONNX Runtime has graph optimizations (https://onnxruntime.ai/docs/resources/graph-optimizations.html).
    However, the coverage is limited. We also have graph fusions that are implemented in Python to improve the coverage.
    They can be combined: ONNX Runtime will run first when opt_level > 0, then graph fusions in Python will be applied.

    To use ONNX Runtime only and no Python fusion logic, use only_onnxruntime flag and a positive opt_level like
        optimize_model(input, opt_level=1, use_gpu=False, only_onnxruntime=True)

    When opt_level is None, we will choose default optimization level according to model type.
    When opt_level is 0 and only_onnxruntime is False, only python fusion logic is used and onnxruntime is disabled.

    When opt_level > 1, use_gpu shall set properly since the optimized graph might contain operators for GPU or CPU only.
    If your model is intended for GPU inference only (especially float16 or mixed precision model), it is recommended to
    set use_gpu to be True, otherwise the model is not optimized for GPU inference.

    For BERT model, num_heads and hidden_size are optional. For other model types, you need specify these parameters.

    Args:
        input (str): input model path.
        model_type (str, optional): model type - like bert, bert_tf, bert_keras or gpt2. Defaults to 'bert'.
        num_heads (int, optional): number of attention heads. Defaults to 0.
            0 allows detect the parameter from graph automatically (for model_type "bert" only).
        hidden_size (int, optional): hidden size. Defaults to 0.
            0 allows detect the parameter from graph automatically (for model_type "bert" only).
        optimization_options (FusionOptions, optional): optimization options that turn on/off some fusions. Defaults to None.
        opt_level (int, optional): onnxruntime graph optimization level (0, 1, 2 or 99) or None. Defaults to None.
            When the value is None, default value (1 for bert and gpt2, 0 for other model types) will be used.
            When the level > 0, onnxruntime will be used to optimize model first.
        use_gpu (bool, optional): use gpu or not for onnxruntime. Defaults to False.
        only_onnxruntime (bool, optional): only use onnxruntime to optimize model, and no python fusion.
            Defaults to False.

    Returns:
        object of an optimizer class.
    """
    assert opt_level is None or opt_level in [0, 1, 2, 99]

    # num_heads/hidden_size auto-detection is only supported for 'bert'.
    if model_type != "bert" and (num_heads == 0 or hidden_size == 0):
        logger.warning("Please specify parameters of num_heads and hidden_size when model_type is not 'bert'")

    (optimizer_class, producer, default_opt_level) = MODEL_TYPES[model_type]

    if opt_level is None:
        opt_level = default_opt_level

    temp_model_path = None
    if opt_level > 1:
        temp_model_path = optimize_by_onnxruntime(input, use_gpu=use_gpu, opt_level=opt_level)
    elif opt_level == 1:
        # basic optimizations (like constant folding and cast elimination) are not specific to an execution provider.
        # CPU provider is used here so that there is no extra node for GPU memory copy.
        temp_model_path = optimize_by_onnxruntime(input, use_gpu=False, opt_level=1)

    if only_onnxruntime and not temp_model_path:
        logger.warning("Please specify a positive value for opt_level when only_onnxruntime is True")

    model = load_model(temp_model_path or input, format=None, load_external_data=True)

    if model.producer_name and producer != model.producer_name:
        logger.warning(
            f"Model producer not matched: Expect {producer}, Got {model.producer_name} {model.producer_version}. Please specify correct --model_type parameter."
        )

    if optimization_options is None:
        optimization_options = FusionOptions(model_type)

    optimizer = optimizer_class(model, num_heads, hidden_size)

    if not only_onnxruntime:
        optimizer.optimize(optimization_options)

    # Remove the temporary model produced by the onnxruntime pre-optimization pass.
    if temp_model_path:
        os.remove(temp_model_path)
        logger.debug("Remove temporary model: {}".format(temp_model_path))

    # Stamp the optimized model with this package's producer metadata.
    optimizer.model.producer_name = "onnxruntime.transformers"
    from onnxruntime import __version__ as onnxruntime_version
    optimizer.model.producer_version = onnxruntime_version

    return optimizer
def _parse_arguments():
    """Build the CLI for the graph-optimization tool and parse sys.argv."""
    arg_parser = argparse.ArgumentParser(
        description=
        'Graph optimization tool for ONNX Runtime. It transforms ONNX graph to use optimized operators for Transformer models.'
    )

    # Model input/output paths.
    arg_parser.add_argument('--input', required=True, type=str, help="input onnx model path")
    arg_parser.add_argument('--output', required=True, type=str, help="optimized onnx model path")

    # Model description.
    arg_parser.add_argument('--model_type',
                            required=False,
                            type=str.lower,
                            default="bert",
                            choices=list(MODEL_TYPES.keys()),
                            help="Model type selected in the list: " + ", ".join(MODEL_TYPES.keys()))
    arg_parser.add_argument(
        '--num_heads',
        required=False,
        type=int,
        default=12,
        help=
        "number of attention heads. 12 for bert-base model and 16 for bert-large. For BERT, set it to 0 to detect automatically."
    )
    arg_parser.add_argument(
        '--hidden_size',
        required=False,
        type=int,
        default=768,
        help=
        "bert model hidden size. 768 for bert-base model and 1024 for bert-large. For BERT, set it to 0 to detect automatically."
    )

    # Precision / input-type conversion flags.
    arg_parser.add_argument(
        '--input_int32',
        required=False,
        action='store_true',
        help="Use int32 (instead of int64) tensor as input to avoid unnecessary data cast.")
    arg_parser.set_defaults(input_int32=False)
    arg_parser.add_argument(
        '--float16',
        required=False,
        action='store_true',
        help=
        "If your target device is V100 or T4 GPU, try this to convert float32 to float16 for best performance (with potential loss in precision)."
    )
    arg_parser.set_defaults(float16=False)

    # Fusion on/off switches are contributed by FusionOptions.
    FusionOptions.add_arguments(arg_parser)

    arg_parser.add_argument('--verbose',
                            required=False,
                            action='store_true',
                            help="show debug information.")
    arg_parser.set_defaults(verbose=False)

    # Runtime/optimization-level options.
    arg_parser.add_argument(
        '--use_gpu',
        required=False,
        action='store_true',
        help="use GPU for inference. Set this flag if your model is intended for GPU and opt_level > 1.")
    arg_parser.set_defaults(use_gpu=False)

    arg_parser.add_argument('--only_onnxruntime',
                            required=False,
                            action='store_true',
                            help="optimized by onnxruntime only, and no graph fusion in Python")
    arg_parser.set_defaults(only_onnxruntime=False)

    arg_parser.add_argument(
        '--opt_level',
        required=False,
        type=int,
        choices=[0, 1, 2, 99],
        default=None,
        help=
        "onnxruntime optimization level. 0 will disable onnxruntime graph optimization. Graph fusion in Python is not impacted by setting."
    )

    arg_parser.add_argument('--use_external_data_format',
                            required=False,
                            action='store_true',
                            help="use external data format")
    arg_parser.set_defaults(use_external_data_format=False)

    return arg_parser.parse_args()