Example #1
def optimize_onnx_model(onnx_model_path, optimized_model_path, model_type,
                        num_attention_heads, hidden_size, use_gpu, fp16,
                        overwrite):
    if overwrite or not os.path.exists(optimized_model_path):
        from optimizer import optimize_model
        from BertOnnxModel import BertOptimizationOptions
        optimization_options = BertOptimizationOptions(model_type)
        if fp16:
            optimization_options.enable_gelu_approximation = True

        # Use script to optimize model.
        # Use opt_level <= 1 for models to be converted to fp16, because some fused ops (like FusedGemm) have only fp32 and no fp16 implementation.
        # It is better to be conservative so we use opt_level=0 here, in case MemcpyFromHost is added to the graph by OnnxRuntime.
        opt_model = optimize_model(onnx_model_path,
                                   model_type,
                                   num_heads=num_attention_heads,
                                   hidden_size=hidden_size,
                                   opt_level=0,
                                   optimization_options=optimization_options,
                                   use_gpu=use_gpu,
                                   only_onnxruntime=False)
        model_fusion_statistics[
            optimized_model_path] = opt_model.get_fused_operator_statistics()

        if fp16:
            opt_model.convert_model_float32_to_float16()
        opt_model.save_model_to_file(optimized_model_path)
    else:
        logger.info(
            f"Skip optimization since model existed: {optimized_model_path}")
def optimize_model(input,
                   model_type,
                   num_heads,
                   hidden_size,
                   opt_level=99,
                   optimization_options=None):
    (optimizer_class, producer, run_onnxruntime) = MODEL_CLASSES[model_type]

    input_model_path = input
    if run_onnxruntime and opt_level > 0:
        input_model_path = optimize_by_onnxruntime(input_model_path,
                                                   use_gpu=False,
                                                   opt_level=opt_level)
        logger.info(
            "Use OnnxRuntime to optimize and save the optimized model to {}".
            format(input_model_path))

    model = ModelProto()
    with open(input_model_path, "rb") as f:
        model.ParseFromString(f.read())

    if model.producer_name and producer != model.producer_name:
        logger.warning(
            f"Model producer not matched: Expect {producer},  Got {model.producer_name} {model.producer_version}. Please specify correct --model_type parameter."
        )

    if optimization_options is None:
        optimization_options = BertOptimizationOptions(model_type)

    bert_model = optimizer_class(model, num_heads, hidden_size)
    bert_model.optimize(optimization_options)

    return bert_model
Example #3
def optimize_model(input,
                   model_type,
                   num_heads,
                   hidden_size,
                   opt_level=0,
                   optimization_options=None,
                   use_gpu=False,
                   only_onnxruntime=False):
    (optimizer_class, producer, run_onnxruntime) = MODEL_CLASSES[model_type]

    input_model_path = input

    if opt_level > 1: # Optimization specified for an execution provider.
        input_model_path = optimize_by_onnxruntime(input_model_path, use_gpu=use_gpu, opt_level=opt_level)
    elif run_onnxruntime:
        # Use Onnxruntime to do optimizations (like constant folding and cast elimination) that are not specific to an execution provider.
        # CPU provider is used here so that there is no extra node for GPU memory copy.
        input_model_path = optimize_by_onnxruntime(input_model_path, use_gpu=False, opt_level=1)

    model = load_model(input_model_path, format=None, load_external_data=True)

    if model.producer_name and producer != model.producer_name:
        logger.warning(
            f"Model producer not matched: Expect {producer},  Got {model.producer_name} {model.producer_version}. Please specify correct --model_type parameter."
        )

    if optimization_options is None:
        optimization_options = BertOptimizationOptions(model_type)

    bert_model = optimizer_class(model, num_heads, hidden_size)

    if not only_onnxruntime:
        bert_model.optimize(optimization_options)

    return bert_model
Example #4
def optimize_model(input,
                   model_type='bert',
                   num_heads=12,
                   hidden_size=768,
                   optimization_options=None,
                   opt_level=0,
                   use_gpu=False,
                   only_onnxruntime=False):
    """ Optimize Model by OnnxRuntime and/or offline fusion logic.

    The following optimizes the model with OnnxRuntime only, without offline fusion logic:
        optimize_model(input, opt_level=1, use_gpu=False, only_onnxruntime=True)
    If you want to optimize the model with offline fusion logic:
        optimize_model(input, model_type, num_heads=12, hidden_size=768, optimization_options=your_options)

    Args:
        input (str): input model path.
        model_type (str): model type - like bert, bert_tf, bert_keras or gpt2.
        num_heads (int): number of attention heads.
        hidden_size (int): hidden size.
        optimization_options (OptimizationOptions or None): optimization options that can be used to turn some fusions on/off.
        opt_level (int): onnxruntime graph optimization level (0, 1, 2 or 99). When the level is greater than 0, onnxruntime will be used to optimize the model first.
        use_gpu (bool): use gpu or not for onnxruntime.
        only_onnxruntime (bool): only use onnxruntime to optimize the model; no offline fusion logic is used.

    Returns:
        object of an optimizer class.
    """
    (optimizer_class, producer, run_onnxruntime) = MODEL_CLASSES[model_type]

    input_model_path = input

    if opt_level > 1:  # Optimization specified for an execution provider.
        input_model_path = optimize_by_onnxruntime(input_model_path,
                                                   use_gpu=use_gpu,
                                                   opt_level=opt_level)
    elif run_onnxruntime:
        # Use Onnxruntime to do optimizations (like constant folding and cast elimination) that are not specific to an execution provider.
        # CPU provider is used here so that there is no extra node for GPU memory copy.
        input_model_path = optimize_by_onnxruntime(input_model_path,
                                                   use_gpu=False,
                                                   opt_level=1)

    model = load_model(input_model_path, format=None, load_external_data=True)

    if model.producer_name and producer != model.producer_name:
        logger.warning(
            f"Model producer not matched: Expect {producer},  Got {model.producer_name} {model.producer_version}. Please specify correct --model_type parameter."
        )

    if optimization_options is None:
        optimization_options = BertOptimizationOptions(model_type)

    optimizer = optimizer_class(model, num_heads, hidden_size)

    if not only_onnxruntime:
        optimizer.optimize(optimization_options)

    return optimizer
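
Following the docstring, a minimal usage sketch (the model path and BERT-base sizes below are assumptions, not part of the original example):

# ONNX Runtime graph optimizations only, no offline fusion logic:
opt_model = optimize_model("bert-base.onnx", opt_level=1, use_gpu=False, only_onnxruntime=True)

# Offline fusion logic with the default options, then save the result:
opt_model = optimize_model("bert-base.onnx", model_type="bert", num_heads=12, hidden_size=768)
opt_model.save_model_to_file("bert-base_opt.onnx")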
Example #5
def get_optimization_options(args):
    optimization_options = BertOptimizationOptions(args.model_type)
    if args.disable_gelu:
        optimization_options.enable_gelu = False
    if args.disable_layer_norm:
        optimization_options.enable_layer_norm = False
    if args.disable_attention:
        optimization_options.enable_attention = False
    if args.disable_skip_layer_norm:
        optimization_options.enable_skip_layer_norm = False
    if args.disable_embed_layer_norm:
        optimization_options.enable_embed_layer_norm = False
    if args.disable_bias_skip_layer_norm:
        optimization_options.enable_bias_skip_layer_norm = False
    if args.disable_bias_gelu:
        optimization_options.enable_bias_gelu = False
    if args.enable_gelu_approximation:
        optimization_options.enable_gelu_approximation = True
    return optimization_options
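
get_optimization_options reads boolean flags from an argparse-style namespace. A hypothetical parser that exposes exactly those flags (the flag names mirror the attribute names used above):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--model_type", default="bert")
for flag in ["disable_gelu", "disable_layer_norm", "disable_attention",
             "disable_skip_layer_norm", "disable_embed_layer_norm",
             "disable_bias_skip_layer_norm", "disable_bias_gelu",
             "enable_gelu_approximation"]:
    parser.add_argument("--" + flag, action="store_true")

args = parser.parse_args(["--disable_attention", "--enable_gelu_approximation"])
optimization_options = get_optimization_options(args)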
Example #6
def optimize_onnx_model(onnx_model_filename, model_type, num_attention_heads, hidden_size, use_gpu, fp16, overwrite):
    suffix = "_fp{}_{}.onnx".format(16 if fp16 else 32, "gpu" if use_gpu else "cpu")
    optimized_model_filename = onnx_model_filename.replace(".onnx", suffix)
    if overwrite or not os.path.exists(optimized_model_filename):
        from optimizer import optimize_model
        from BertOnnxModel import BertOptimizationOptions
        optimization_options = BertOptimizationOptions(model_type)
        if fp16:
            optimization_options.enable_gelu_approximation = True

        # Use onnxruntime to optimize model, which will be saved to *_ort_cpu.onnx
        opt_model = optimize_model(onnx_model_filename,
                                   model_type,
                                   num_heads=num_attention_heads,
                                   hidden_size=hidden_size,
                                   opt_level=99,
                                   optimization_options=optimization_options,
                                   use_gpu=use_gpu,
                                   only_onnxruntime=True)
        model_fusion_statistics[onnx_model_filename] = opt_model.get_fused_operator_statistics()

        # Use script to optimize model.
        # Use opt_level <= 1 for models to be converted to fp16, because some fused ops (like FusedGemm) have only fp32 and no fp16 implementation.
        # It is better to be conservative so we use opt_level=0 here, in case MemcpyFromHost is added to the graph by OnnxRuntime.
        opt_model = optimize_model(onnx_model_filename,
                                   model_type,
                                   num_heads=num_attention_heads,
                                   hidden_size=hidden_size,
                                   opt_level=0,
                                   optimization_options=optimization_options,
                                   use_gpu=use_gpu,
                                   only_onnxruntime=False)
        model_fusion_statistics[optimized_model_filename] = opt_model.get_fused_operator_statistics()

        if fp16:
            opt_model.convert_model_float32_to_float16()
        opt_model.save_model_to_file(optimized_model_filename)
    else:
        logger.info(f"Skip optimization since model existed: {optimized_model_filename}")
    return optimized_model_filename
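
A short usage sketch for this variant (file name and sizes are assumptions; as in Example #1, the module-level logger and model_fusion_statistics dict must exist). The optimized file name is derived from the input by replacing ".onnx" with a precision/device suffix, so the call below would write and return "model_fp32_cpu.onnx":

optimized_path = optimize_onnx_model("model.onnx", "bert",
                                     num_attention_heads=12, hidden_size=768,
                                     use_gpu=False, fp16=False, overwrite=True)
print(optimized_path)  # model_fp32_cpu.onnx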