def quantize(onnx_model_path: Path) -> Path:
    """
    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPUs.

    Args:
        onnx_model_path: Path to the location where the exported ONNX model is stored

    Returns:
        The Path generated for the quantized model
    """
    import onnx
    from onnxruntime.quantization import QuantizationMode, quantize

    onnx_model = onnx.load(onnx_model_path.as_posix())

    # Discussed with @yufenglee from ONNX runtime; this will be addressed in the next release of onnxruntime
    print(
        "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n"
        "This limitation will be removed in the next release of onnxruntime."
    )

    quantized_model = quantize(
        model=onnx_model,
        quantization_mode=QuantizationMode.IntegerOps,
        force_fusions=True,
        symmetric_weight=True,
    )

    # Append "-quantized" at the end of the model's name
    quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized")

    # Save model
    print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}")
    onnx.save_model(quantized_model, quantized_model_path.as_posix())

    return quantized_model_path
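# A minimal usage sketch (not part of the original): it assumes the model was
# already exported to ONNX (e.g. via transformers.convert_graph_to_onnx.convert)
# and that generate_identified_filename is in scope; "onnx/model.onnx" is a
# placeholder path.
from pathlib import Path

quantized_path = quantize(Path("onnx/model.onnx"))
print(quantized_path)  # e.g. onnx/model-quantized.onnx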
    # onnxruntime optimizations don't provide the same level of performance on TensorFlow as on PyTorch
    if args.framework == "tf":
        print(
            "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n"
            "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n"
            "\t For more information, please refer to the onnxruntime documentation:\n"
            "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n"
        )

    print("\n====== Optimizing ONNX model ======")

    # Quantization works best when using the optimized version of the model
    args.optimized_output = optimize(args.output)

    # Do the quantization on the right graph
    args.quantized_output = quantize(args.optimized_output)

    # And verify
    if args.check_loading:
        print("\n====== Check exported ONNX model(s) ======")
        verify(args.output)

        if hasattr(args, "optimized_output"):
            verify(args.optimized_output)

        if hasattr(args, "quantized_output"):
            verify(args.quantized_output)

except Exception as e:
    print(f"Error while converting the model: {e}")
    exit(1)
# Assumed imports for this excerpt: import os; import onnx;
# from onnxruntime_tools import optimizer;
# from onnxruntime.quantization import QuantizationMode, quantize.
# The opening of the call below is elided in the original excerpt; only the
# keyword-argument tail of the ONNX export call survives:
        model=model_path,
        tokenizer=args.model_name,
        output=os.path.join(onnx_path, 'converted.onnx'),  # original concatenated onnx_path + 'converted.onnx' without a path separator
        opset=11)

print('>> optimizing..')
# ONNX optimization
optimized_model = optimizer.optimize_model(os.path.join(onnx_path, 'converted.onnx'),
                                           model_type=args.model_type,
                                           num_heads=12,
                                           hidden_size=768)
optimized_onnx_model_path = os.path.join(onnx_path, 'bert_optimized.onnx')
optimized_model.save_model_to_file(optimized_onnx_model_path)
print('Optimized model saved at :', optimized_onnx_model_path)

print('>> quantizing..')
# Quantize the optimized graph (the original loaded 'converted.onnx' here,
# which quantized the unoptimized model despite the output file's name)
model = onnx.load(optimized_onnx_model_path)
quantized_model = quantize(model=model,
                           quantization_mode=QuantizationMode.IntegerOps,
                           force_fusions=True,
                           symmetric_weight=True)
optimized_quantized_onnx_model_path = os.path.join(
    os.path.dirname(optimized_onnx_model_path),
    'ONNX_model_optimized_quantized.onnx')
onnx.save_model(quantized_model, optimized_quantized_onnx_model_path)
print('Quantized&optimized model saved at :', optimized_quantized_onnx_model_path)
# break
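# Hedged follow-up sketch (not in the original): load the optimized+quantized
# model into an InferenceSession to confirm the file is a valid ONNX graph and
# to inspect the exported input names before wiring up the tokenizer.
import onnxruntime

sess = onnxruntime.InferenceSession(optimized_quantized_onnx_model_path)
print('inputs :', [i.name for i in sess.get_inputs()])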
import time
import tracemalloc

import numpy as np
import onnx
import onnxruntime
from onnxruntime.quantization import QuantizationMode, quantize

# x_test is assumed to be defined earlier in the script (the test split of the dataset)
model = onnx.load("C:/Users/82109/PycharmProjects/Quantization/original_model.onnx")

# Time the float32 model: one batched run over n samples, reported as per-sample latency
sess0 = onnxruntime.InferenceSession("C:/Users/82109/PycharmProjects/Quantization/original_model.onnx")
input_name = sess0.get_inputs()[0].name

n = 1000
start = time.time()
pred0_onnx = sess0.run(None, {input_name: x_test[:n].astype(np.float32)})
print("ori_pred : ", (time.time() - start) / n)

# Memory snapshot (assumes tracemalloc.start() was called earlier in the script;
# take_snapshot() raises RuntimeError otherwise)
snapshot = tracemalloc.take_snapshot()
top_stats = snapshot.statistics('lineno')

# Quantize the weights to int8 and save the result
quantized_model = quantize(model, quantization_mode=QuantizationMode.IntegerOps)
onnx.save(quantized_model, "C:/Users/82109/PycharmProjects/Quantization/orginal_model_test.onnx")

# Reload the quantized model and time it on the same inputs
Q_model = onnx.load("C:/Users/82109/PycharmProjects/Quantization/orginal_model_test.onnx")
sess = onnxruntime.InferenceSession("C:/Users/82109/PycharmProjects/Quantization/orginal_model_test.onnx")
input_name = sess.get_inputs()[0].name

start = time.time()
pred_onnx = sess.run(None, {input_name: x_test[:n].astype(np.float32)})
print("Q_pred : ", (time.time() - start) / n)
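# Hedged addition (not in the original): besides latency, compare the two
# models' outputs on the same batch to see how much the int8 quantization
# perturbs the predictions.
diff = np.abs(pred0_onnx[0] - pred_onnx[0])
print("max abs diff :", diff.max(), " mean abs diff :", diff.mean())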
args = parser.parse_args()

# Make sure output is absolute path
args.output = Path(args.output).absolute()

try:
    # Convert
    convert(
        args.framework,
        args.model,
        args.output,
        args.opset,
        args.tokenizer,
        args.use_external_format,
        args.pipeline,
    )

    if args.quantize:
        args.quantized_output = quantize(args.output)

    # And verify
    if args.check_loading:
        verify(args.output)

        if hasattr(args, "quantized_output"):
            verify(args.quantized_output)
except Exception as e:
    print(f"Error while converting the model: {e}")
    exit(1)
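# Hedged end-to-end sketch (not in the original): the same flow driven directly
# from Python rather than argparse. The positional order mirrors the convert(...)
# call above; the model name, opset, and the tokenizer/external-format/pipeline
# values are placeholder assumptions, not values taken from the source.
from pathlib import Path

output = Path("onnx/bert-base-cased.onnx").absolute()
convert("pt", "bert-base-cased", output, 11, None, False, "feature-extraction")
quantized_output = quantize(output)
verify(output)
verify(quantized_output)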