Example #1
0
def optimize_onnx_model(onnx_model_path, optimized_model_path, model_type,
                        num_attention_heads, hidden_size, use_gpu, fp16,
                        overwrite):
    """Run the offline fusion optimizer on an ONNX model and save the result.

    Nothing is done (apart from an info log) when ``optimized_model_path``
    already exists and ``overwrite`` is false.

    Args:
        onnx_model_path: path of the ONNX model to optimize.
        optimized_model_path: path the optimized model is saved to; also used
            as the key under which fusion statistics are recorded.
        model_type: model type forwarded to ``BertOptimizationOptions`` and
            ``optimize_model``.
        num_attention_heads: forwarded to ``optimize_model`` as ``num_heads``.
        hidden_size: forwarded to ``optimize_model``.
        use_gpu: forwarded to ``optimize_model``.
        fp16: when true, Gelu approximation is enabled and the optimized model
            is converted from float32 to float16 before saving.
        overwrite: when true, optimize even if the output file already exists.

    Side effects:
        Stores the fused-operator statistics in ``model_fusion_statistics``,
        a name resolved from outside this function (presumably a module-level
        mapping -- confirm against the rest of the file).
    """
    if not overwrite and os.path.exists(optimized_model_path):
        logger.info(
            f"Skip optimization since model existed: {optimized_model_path}")
        return

    from optimizer import optimize_model
    from onnx_model_bert import BertOptimizationOptions

    fusion_options = BertOptimizationOptions(model_type)
    if fp16:
        fusion_options.enable_gelu_approximation = True

    # The optimizer script does the fusion work. Models headed for fp16 need
    # opt_level <= 1, because some fused ops (like FusedGemm) exist only in
    # fp32. opt_level=0 is the conservative choice here, in case OnnxRuntime
    # would otherwise add MemcpyFromHost to the graph.
    optimizer = optimize_model(onnx_model_path,
                               model_type,
                               num_heads=num_attention_heads,
                               hidden_size=hidden_size,
                               opt_level=0,
                               optimization_options=fusion_options,
                               use_gpu=use_gpu,
                               only_onnxruntime=False)

    # Statistics are taken before any fp16 conversion.
    fused_counts = optimizer.get_fused_operator_statistics()
    model_fusion_statistics[optimized_model_path] = fused_counts

    if fp16:
        optimizer.convert_model_float32_to_float16()
    optimizer.save_model_to_file(optimized_model_path)
Example #2
0
def optimize_onnx_model(model_name, onnx_model_path, optimized_model_path,
                        model_type, num_attention_heads, hidden_size, use_gpu,
                        precision, use_raw_attention_mask, overwrite,
                        model_fusion_statistics, use_external_data_format):
    """Run the offline fusion optimizer on an ONNX model and save the result.

    Nothing is done (apart from an info log) when ``optimized_model_path``
    already exists and ``overwrite`` is false.

    Args:
        model_name: model name; when it is in ``EXEMPT_MODELS`` the model is
            saved with the external data format flag forced to ``False``.
        onnx_model_path: path of the ONNX model to optimize.
        optimized_model_path: path the optimized model is saved to. Its parent
            directory is created if missing.
        model_type: model type forwarded to ``BertOptimizationOptions`` and
            ``optimize_model``; ``'bert_keras'`` additionally triggers
            ``use_dynamic_axes()`` on the optimized model.
        num_attention_heads: forwarded to ``optimize_model`` as ``num_heads``.
        hidden_size: forwarded to ``optimize_model``.
        use_gpu: forwarded to ``optimize_model``.
        precision: compared against ``Precision.FLOAT16`` (enables Gelu
            approximation and float16 conversion) and ``Precision.INT8``
            (disables embed layer norm fusion).
        use_raw_attention_mask: forwarded to
            ``BertOptimizationOptions.use_raw_attention_mask``.
        overwrite: when true, optimize even if the output file already exists.
        model_fusion_statistics: mapping updated in place; the fused-operator
            statistics are stored under ``optimized_model_path``.
        use_external_data_format: forwarded to ``save_model_to_file`` unless
            the model is exempt.
    """
    if not overwrite and os.path.exists(optimized_model_path):
        logger.info(
            f"Skip optimization since model existed: {optimized_model_path}")
        return

    Path(optimized_model_path).parent.mkdir(parents=True, exist_ok=True)

    from optimizer import optimize_model
    from onnx_model_bert import BertOptimizationOptions

    fusion_options = BertOptimizationOptions(model_type)
    fusion_options.use_raw_attention_mask(use_raw_attention_mask)

    to_fp16 = Precision.FLOAT16 == precision
    if to_fp16:
        fusion_options.enable_gelu_approximation = True
    if Precision.INT8 == precision:
        fusion_options.enable_embed_layer_norm = False

    # The optimizer script does the fusion work. Models headed for fp16 need
    # opt_level <= 1, because some fused ops (like FusedGemm) exist only in
    # fp32. opt_level=0 is the conservative choice here, in case OnnxRuntime
    # would otherwise add MemcpyFromHost to the graph.
    optimizer = optimize_model(onnx_model_path,
                               model_type,
                               num_heads=num_attention_heads,
                               hidden_size=hidden_size,
                               opt_level=0,
                               optimization_options=fusion_options,
                               use_gpu=use_gpu,
                               only_onnxruntime=False)
    if model_type == 'bert_keras':
        optimizer.use_dynamic_axes()

    # Statistics are taken before any fp16 conversion.
    fused_counts = optimizer.get_fused_operator_statistics()
    model_fusion_statistics[optimized_model_path] = fused_counts

    if to_fp16:
        optimizer.convert_model_float32_to_float16()

    # Exempt models are always saved without the external data format.
    save_external = (False if model_name in EXEMPT_MODELS else
                     use_external_data_format)
    optimizer.save_model_to_file(optimized_model_path, save_external)
Example #3
0
def optimize_model(input,
                   model_type='bert',
                   num_heads=12,
                   hidden_size=768,
                   optimization_options=None,
                   opt_level=0,
                   use_gpu=False,
                   only_onnxruntime=False):
    """ Optimize Model by OnnxRuntime and/or offline fusion logic.

    The following optimizes model by OnnxRuntime only, and no offline fusion logic:
        optimize_model(input, opt_level=1, use_gpu=False, only_onnxruntime=True)
    If you want to optimize model by offline fusion logic.
        optimize_model(input, model_type, num_heads=12, hidden_size=768, optimization_options=your_options)

    Note that OnnxRuntime is only run when opt_level > 1, or when the
    MODEL_CLASSES entry for model_type enables it (in which case level 1 on
    CPU is used regardless of opt_level and use_gpu).

    Args:
        input (str): input model path.
        model_type (str): model type - like bert, bert_tf, bert_keras or gpt2. Must be a key of MODEL_CLASSES.
        num_heads (int): number of attention heads.
        hidden_size (int): hidden size.
        optimization_options (OptimizationOptions or None): optimization options that can use to turn on/off some fusions.
            When None, BertOptimizationOptions(model_type) is used.
        opt_level (int): onnxruntime graph optimization level (0, 1, 2 or 99). When the level > 1, onnxruntime is used
            to optimize model first with this level and the given use_gpu.
        use_gpu (bool): use gpu or not for onnxruntime. Only consulted when opt_level > 1.
        only_onnxruntime (bool): only use onnxruntime to optimize model, and no offline fusion logic is used.

     Returns:
        object of an optimizer class.

    Raises:
        KeyError: presumably, when model_type is not a key of MODEL_CLASSES.
        Any exception raised while loading or optimizing the model is
        propagated; the temporary model file, if one was created, is removed
        first.
    """
    (optimizer_class, producer, run_onnxruntime) = MODEL_CLASSES[model_type]

    temp_model_path = None
    if opt_level > 1:  # Optimization specified for an execution provider.
        temp_model_path = optimize_by_onnxruntime(input,
                                                  use_gpu=use_gpu,
                                                  opt_level=opt_level)
    elif run_onnxruntime:
        # Use Onnxruntime to do optimizations (like constant folding and cast elimination) that are not specific to an execution provider.
        # CPU provider is used here so that there is no extra node for GPU memory copy.
        temp_model_path = optimize_by_onnxruntime(input,
                                                  use_gpu=False,
                                                  opt_level=1)

    # Everything that needs the temporary file runs inside try/finally so the
    # file is cleaned up even when loading or fusion fails.
    try:
        model = load_model(temp_model_path or input,
                           format=None,
                           load_external_data=True)

        if model.producer_name and producer != model.producer_name:
            logger.warning(
                f"Model producer not matched: Expect {producer},  Got {model.producer_name} {model.producer_version}. Please specify correct --model_type parameter."
            )

        if optimization_options is None:
            optimization_options = BertOptimizationOptions(model_type)

        optimizer = optimizer_class(model, num_heads, hidden_size)

        if not only_onnxruntime:
            optimizer.optimize(optimization_options)
    finally:
        # Remove the temporary model.
        if temp_model_path:
            os.remove(temp_model_path)
            logger.debug("Remove tempoary model: {}".format(temp_model_path))

    # Stamp the result so downstream tools can tell it went through this optimizer.
    optimizer.model.producer_name = "onnxruntime_tools"
    optimizer.model.producer_version = "1.5.1"

    return optimizer
Example #4
0
def _get_optimization_options(args):
    """Build a BertOptimizationOptions from parsed command-line arguments.

    Args:
        args: object exposing ``model_type`` plus the boolean flags read
            below (``disable_*``, ``enable_gelu_approximation``,
            ``use_mask_index`` and ``no_attention_mask``).

    Returns:
        The ``BertOptimizationOptions`` instance with every requested fusion
        switch applied.
    """
    options = BertOptimizationOptions(args.model_type)

    # Each truthy "disable" flag turns the matching fusion switch off.
    # Pairs are (flag on args, attribute on options), checked in this order.
    switch_off = (
        ("disable_gelu", "enable_gelu"),
        ("disable_layer_norm", "enable_layer_norm"),
        ("disable_attention", "enable_attention"),
        ("disable_skip_layer_norm", "enable_skip_layer_norm"),
        ("disable_embed_layer_norm", "enable_embed_layer_norm"),
        ("disable_bias_skip_layer_norm", "enable_bias_skip_layer_norm"),
        ("disable_bias_gelu", "enable_bias_gelu"),
    )
    for flag_name, option_name in switch_off:
        if getattr(args, flag_name):
            setattr(options, option_name, False)

    if args.enable_gelu_approximation:
        options.enable_gelu_approximation = True

    # Attention-mask handling is configured through methods, not attributes.
    if args.use_mask_index:
        options.use_raw_attention_mask(False)
    if args.no_attention_mask:
        options.disable_attention_mask()

    return options