def optimize_onnx_model(onnx_model_path, optimized_model_path, model_type, num_attention_heads, hidden_size, use_gpu,
                        fp16, overwrite):
    if overwrite or not os.path.exists(optimized_model_path):
        from optimizer import optimize_model
        from onnx_model_bert import BertOptimizationOptions
        optimization_options = BertOptimizationOptions(model_type)
        if fp16:
            optimization_options.enable_gelu_approximation = True

        # Use script to optimize model.
        # Use opt_level <= 1 for models to be converted to fp16, because some fused ops (like FusedGemm)
        # have only fp32 and no fp16 implementation.
        # It is better to be conservative, so we use opt_level=0 here, in case MemcpyFromHost is added
        # to the graph by OnnxRuntime.
        opt_model = optimize_model(onnx_model_path,
                                   model_type,
                                   num_heads=num_attention_heads,
                                   hidden_size=hidden_size,
                                   opt_level=0,
                                   optimization_options=optimization_options,
                                   use_gpu=use_gpu,
                                   only_onnxruntime=False)
        # model_fusion_statistics is a module-level dict (defined elsewhere in the script) that collects
        # fusion counts per optimized model.
        model_fusion_statistics[optimized_model_path] = opt_model.get_fused_operator_statistics()

        if fp16:
            opt_model.convert_model_float32_to_float16()
        opt_model.save_model_to_file(optimized_model_path)
    else:
        logger.info(f"Skip optimization since model exists: {optimized_model_path}")

def optimize_onnx_model(model_name, onnx_model_path, optimized_model_path, model_type, num_attention_heads,
                        hidden_size, use_gpu, precision, use_raw_attention_mask, overwrite, model_fusion_statistics,
                        use_external_data_format):
    if overwrite or not os.path.exists(optimized_model_path):
        Path(optimized_model_path).parent.mkdir(parents=True, exist_ok=True)

        from optimizer import optimize_model
        from onnx_model_bert import BertOptimizationOptions
        optimization_options = BertOptimizationOptions(model_type)
        optimization_options.use_raw_attention_mask(use_raw_attention_mask)
        if Precision.FLOAT16 == precision:
            optimization_options.enable_gelu_approximation = True
        if Precision.INT8 == precision:
            optimization_options.enable_embed_layer_norm = False

        # Use script to optimize model.
        # Use opt_level <= 1 for models to be converted to fp16, because some fused ops (like FusedGemm)
        # have only fp32 and no fp16 implementation.
        # It is better to be conservative, so we use opt_level=0 here, in case MemcpyFromHost is added
        # to the graph by OnnxRuntime.
        opt_model = optimize_model(onnx_model_path,
                                   model_type,
                                   num_heads=num_attention_heads,
                                   hidden_size=hidden_size,
                                   opt_level=0,
                                   optimization_options=optimization_options,
                                   use_gpu=use_gpu,
                                   only_onnxruntime=False)
        if model_type == 'bert_keras':
            opt_model.use_dynamic_axes()

        model_fusion_statistics[optimized_model_path] = opt_model.get_fused_operator_statistics()

        if Precision.FLOAT16 == precision:
            opt_model.convert_model_float32_to_float16()

        if model_name in EXEMPT_MODELS:
            use_external_data_format = False
        opt_model.save_model_to_file(optimized_model_path, use_external_data_format)
    else:
        logger.info(f"Skip optimization since model exists: {optimized_model_path}")

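# A minimal usage sketch for the second optimize_onnx_model variant above. The model name, paths and
# hyper-parameters are made-up examples; Precision and EXEMPT_MODELS are assumed to be defined at module
# level as in the surrounding script.
def _example_optimize_bert_base():
    fusion_statistics = {}
    optimize_onnx_model(model_name="bert-base-cased",                            # hypothetical model name
                        onnx_model_path="onnx_models/bert-base-cased.onnx",      # placeholder input path
                        optimized_model_path="onnx_models/bert-base-cased_fp16.onnx",
                        model_type="bert",
                        num_attention_heads=12,
                        hidden_size=768,
                        use_gpu=True,
                        precision=Precision.FLOAT16,
                        use_raw_attention_mask=False,
                        overwrite=False,
                        model_fusion_statistics=fusion_statistics,
                        use_external_data_format=False)
    return fusion_statistics
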
def optimize_model(input,
                   model_type='bert',
                   num_heads=12,
                   hidden_size=768,
                   optimization_options=None,
                   opt_level=0,
                   use_gpu=False,
                   only_onnxruntime=False):
    """ Optimize Model by OnnxRuntime and/or offline fusion logic.

    The following optimizes model by OnnxRuntime only, and no offline fusion logic:
        optimize_model(input, opt_level=1, use_gpu=False, only_onnxruntime=True)

    If you want to optimize model by offline fusion logic:
        optimize_model(input, model_type, num_heads=12, hidden_size=768, optimization_options=your_options)

    Args:
        input (str): input model path.
        model_type (str): model type - like bert, bert_tf, bert_keras or gpt2.
        num_heads (int): number of attention heads.
        hidden_size (int): hidden size.
        optimization_options (OptimizationOptions or None): optimization options that can be used to turn on/off some fusions.
        opt_level (int): onnxruntime graph optimization level (0, 1, 2 or 99). When the level > 0, onnxruntime will be used to optimize the model first.
        use_gpu (bool): use gpu or not for onnxruntime.
        only_onnxruntime (bool): only use onnxruntime to optimize the model, and no offline fusion logic is used.

    Returns:
        object of an optimizer class.
    """
    (optimizer_class, producer, run_onnxruntime) = MODEL_CLASSES[model_type]

    temp_model_path = None
    if opt_level > 1:  # Optimization specific to an execution provider.
        temp_model_path = optimize_by_onnxruntime(input, use_gpu=use_gpu, opt_level=opt_level)
    elif run_onnxruntime:
        # Use OnnxRuntime to do optimizations (like constant folding and cast elimination) that are not
        # specific to an execution provider.
        # CPU provider is used here so that there is no extra node for GPU memory copy.
        temp_model_path = optimize_by_onnxruntime(input, use_gpu=False, opt_level=1)

    model = load_model(temp_model_path or input, format=None, load_external_data=True)

    if model.producer_name and producer != model.producer_name:
        logger.warning(
            f"Model producer not matched: expected {producer}, got {model.producer_name} {model.producer_version}. Please specify correct --model_type parameter."
        )

    if optimization_options is None:
        optimization_options = BertOptimizationOptions(model_type)

    optimizer = optimizer_class(model, num_heads, hidden_size)

    if not only_onnxruntime:
        optimizer.optimize(optimization_options)

    # Remove the temporary model.
    if temp_model_path:
        os.remove(temp_model_path)
        logger.debug("Removed temporary model: {}".format(temp_model_path))

    optimizer.model.producer_name = "onnxruntime_tools"
    optimizer.model.producer_version = "1.5.1"

    return optimizer

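# A short sketch of calling optimize_model directly, outside the benchmark script. The file paths are
# placeholders; the method calls on the returned optimizer mirror the ones used by optimize_onnx_model above.
def _example_offline_fusion():
    opt_model = optimize_model("model.onnx",            # placeholder input path
                               model_type="bert",
                               num_heads=12,
                               hidden_size=768,
                               opt_level=0)
    print(opt_model.get_fused_operator_statistics())    # inspect which fusions were applied
    opt_model.convert_model_float32_to_float16()        # optional fp16 conversion, as in the callers above
    opt_model.save_model_to_file("model_fp16.onnx")     # placeholder output path
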
def _get_optimization_options(args):
    optimization_options = BertOptimizationOptions(args.model_type)
    if args.disable_gelu:
        optimization_options.enable_gelu = False
    if args.disable_layer_norm:
        optimization_options.enable_layer_norm = False
    if args.disable_attention:
        optimization_options.enable_attention = False
    if args.disable_skip_layer_norm:
        optimization_options.enable_skip_layer_norm = False
    if args.disable_embed_layer_norm:
        optimization_options.enable_embed_layer_norm = False
    if args.disable_bias_skip_layer_norm:
        optimization_options.enable_bias_skip_layer_norm = False
    if args.disable_bias_gelu:
        optimization_options.enable_bias_gelu = False
    if args.enable_gelu_approximation:
        optimization_options.enable_gelu_approximation = True
    if args.use_mask_index:
        optimization_options.use_raw_attention_mask(False)
    if args.no_attention_mask:
        optimization_options.disable_attention_mask()

    return optimization_options

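# A hedged sketch of the argparse flags that _get_optimization_options reads. The real parser in the
# optimizer script may define more options and different defaults/help text; this only declares the
# switches referenced above so the function can be exercised standalone.
def _example_parse_optimization_args(argv=None):
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_type', default='bert')
    for flag in ['--disable_gelu', '--disable_layer_norm', '--disable_attention',
                 '--disable_skip_layer_norm', '--disable_embed_layer_norm',
                 '--disable_bias_skip_layer_norm', '--disable_bias_gelu',
                 '--enable_gelu_approximation', '--use_mask_index', '--no_attention_mask']:
        parser.add_argument(flag, action='store_true')
    args = parser.parse_args(argv)
    return _get_optimization_options(args)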