def __init__(
    self,
    model_name,
    pipeline_name,
    model_path=None,
    device="cpu",
    quantization=False,
    opset=11,
    force=False,
    **convert_kwargs,
):
    if model_path is None:
        model_path = f"onnx/{model_name}.onnx"
    model_path = Path(model_path)
    if not model_path.is_file() or force:
        convert(
            framework="pt",
            model=model_name,
            output=model_path,
            opset=opset,
            pipeline_name=pipeline_name,
            **convert_kwargs,
        )
    if quantization:
        model_path = optimize(model_path)
        model_path = quantize(model_path)
    self.model_path = str(model_path)
    self.provider = "CPUExecutionProvider" if device == "cpu" else "CUDAExecutionProvider"
    self.session = self.create_model_for_provider()
    self.config = AutoConfig.from_pretrained(model_name)
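The constructor above delegates to a `create_model_for_provider` method that is not shown. A minimal sketch of such a helper, assuming it simply wraps `onnxruntime.InferenceSession` around the stored path and provider (the session options here are illustrative defaults, not part of the original snippet):

from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions

def create_model_for_provider(self):
    # Illustrative session tuning; adjust the thread count to the deployment host
    options = SessionOptions()
    options.intra_op_num_threads = 1
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
    # Build the runtime session on the provider chosen in __init__
    return InferenceSession(self.model_path, options, providers=[self.provider])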
def test_quantize_pytorch(self):
    for model in OnnxExportTestCase.MODEL_TO_TEST:
        path = self._test_export(model, "pt", 12)
        quantized_path = quantize(path)

        # Ensure the actual quantized model is not bigger than the original one
        if quantized_path.stat().st_size >= Path(path).stat().st_size:
            self.fail("Quantized model is bigger than initial ONNX model")
def test_quantize_tf(self):
    for model, model_kwargs in OnnxExportTestCase.MODEL_TO_TEST:
        path = self._test_export(model, "tf", 12, **model_kwargs)
        quantized_path = quantize(Path(path))

        # Ensure the actual quantized model is not bigger than the original one
        if quantized_path.stat().st_size >= Path(path).stat().st_size:
            self.fail("Quantized model is bigger than initial ONNX model")
def _save_impl(self, path, export_type=None):
    path = Path(path)
    self._save_gpu(path)
    if export_type == 'cpu':
        with TemporaryDirectory() as temp_dir:
            temp_model_path = Path(temp_dir) / 'temp.onnx'
            convert(framework='pt',
                    model=str(path),
                    output=temp_model_path,
                    pipeline_name='sentiment-analysis',
                    opset=11)
            optimized_path = optimize(temp_model_path)
            quantized_path = quantize(optimized_path)
            target_path = BertCpuClassifier.onnx_model_path(path)
            with open(quantized_path, 'rb') as src, gzip.open(target_path, 'wb') as dst:
                shutil.copyfileobj(src, dst)
        os.remove(path / 'pytorch_model.bin')
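Because `_save_impl` gzips the quantized model, a matching load step has to decompress the file before handing the bytes to ONNX Runtime. A minimal sketch, reusing the snippet's `BertCpuClassifier.onnx_model_path` helper (hypothetical, taken from the snippet above):

import gzip
from onnxruntime import InferenceSession

def _load_onnx_session(path):
    # Decompress the gzipped ONNX file written by _save_impl and build a CPU session
    with gzip.open(BertCpuClassifier.onnx_model_path(path), 'rb') as f:
        model_bytes = f.read()
    return InferenceSession(model_bytes, providers=['CPUExecutionProvider'])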
from pathlib import Path
import tempfile

from transformers.convert_graph_to_onnx import convert, quantize

dest = Path(tempfile.mkdtemp(), "question-answering.onnx")
convert(
    pipeline_name="question-answering",
    model="distilbert-base-cased-distilled-squad",
    output=dest,
    framework="pt",
    opset=11,
)
quantize(dest)
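To sanity-check the export, the quantized model can be loaded with ONNX Runtime. `quantize` returns the path of the new `-quantized` file, so the call above would ideally capture it. A minimal sketch, assuming the tokenizer matches the exported checkpoint:

from onnxruntime import InferenceSession
from transformers import AutoTokenizer

quantized_dest = quantize(dest)  # returns the Path of the "-quantized" sibling file

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
session = InferenceSession(quantized_dest.as_posix(), providers=["CPUExecutionProvider"])

# Run the question-answering graph on a toy example
inputs = tokenizer("Who wrote it?", "It was written by Jane.", return_tensors="np")
start_logits, end_logits = session.run(None, dict(inputs))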
def export_model_to_onnx(self, fpath, quantize=False, target_opset=None, verbose=1):
    """
    Export model to ONNX.

    Args:
      fpath(str): String representing full path to model file where ONNX model will be saved.
                  Example: '/tmp/my_model.onnx'
      quantize(bool): If True, a total of three model files will be created using
                      transformers.convert_graph_to_onnx:
                      1) ONNX model (created directly using keras2onnx)
                      2) an optimized ONNX model (created by transformers library)
                      3) a quantized version of optimized ONNX model (created by transformers library)
                      All files will be created in the parent folder of fpath.
                      Example: If fpath='/tmp/model.onnx', then both /tmp/model-optimized.onnx and
                      /tmp/model-optimized-quantized.onnx will also be created.
      verbose(bool): verbosity

    Returns:
      str: string representing fpath. If quantize=True, returned fpath will be
           different than supplied fpath.
    """
    try:
        import onnxruntime, onnxruntime_tools, onnx, keras2onnx
    except ImportError:
        raise Exception('This method requires ONNX libraries to be installed: ' +
                        'pip install -q --upgrade onnxruntime==1.5.1 onnxruntime-tools onnx keras2onnx')
    from pathlib import Path

    if type(self.preproc).__name__ == 'BERTPreprocessor':
        raise Exception('currently_unsupported: BERT models created with text_classifier("bert",...) are not supported (i.e., keras_bert models). ' +
                        'Only BERT models created with Transformer(...) are supported.')

    if verbose:
        print('converting to ONNX format ... this may take a few moments...')
    if U.is_huggingface(model=self.model):
        tokenizer = self.preproc.get_tokenizer()
        maxlen = self.preproc.maxlen
        input_dict = tokenizer('Name', return_tensors='tf',
                               padding='max_length', max_length=maxlen)

        if version.parse(tf.__version__) < version.parse('2.2'):
            raise Exception('export_model_to_tflite requires tensorflow>=2.2')
        # self.model._set_inputs(input_spec, training=False)  # for tf < 2.2
        self.model._saved_model_inputs_spec = None  # for tf > 2.2
        self.model._set_save_spec(input_dict)       # for tf > 2.2
        self.model._get_save_spec()

    onnx_model = keras2onnx.convert_keras(self.model, self.model.name, target_opset=target_opset)
    keras2onnx.save_model(onnx_model, fpath)
    return_fpath = fpath

    if quantize:
        from transformers.convert_graph_to_onnx import optimize, quantize
        # opt_path = optimize(Path(fpath))
        if U.is_huggingface(model=self.model) and \
           type(self.model).__name__ in ['TFDistilBertForSequenceClassification',
                                         'TFBertForSequenceClassification']:
            try:
                from onnxruntime_tools import optimizer
                from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions

                # disable embedding layer norm optimization for better model size reduction
                opt_options = BertOptimizationOptions('bert')
                opt_options.enable_embed_layer_norm = False

                opt_model = optimizer.optimize_model(
                    fpath,
                    'bert',  # bert_keras causes error with transformers
                    num_heads=12,
                    hidden_size=768,
                    optimization_options=opt_options)
                opt_model.save_model_to_file(fpath)
            except Exception:
                warnings.warn('Could not run BERT-specific optimizations')
        quantize_path = quantize(Path(fpath))
        return_fpath = quantize_path.as_posix()
    if verbose:
        print('done.')
    return return_fpath
    args.country_code][f'iter{str(args.iteration_number)}'][label],
    label, 'models', 'best_model')

onnx_path = os.path.join(model_path, 'onnx')
try:
    # delete the existing onnx folder and its contents, if present; the conversion raises otherwise
    shutil.rmtree(onnx_path)
except FileNotFoundError:
    logger.info('no existing folder, creating one')
os.makedirs(onnx_path)

logger.info('>> converting..')
convert(framework="pt",
        model=model_path,
        tokenizer=convert_model_path_to_model_name(model_path),
        output=Path(os.path.join(onnx_path, 'converted.onnx')),
        opset=11,
        pipeline_name='sentiment-analysis')

logger.info('>> ONNX optimization')
optimized_output = optimize(Path(os.path.join(onnx_path, 'converted.onnx')))

logger.info('>> Quantization')
quantized_output = quantize(optimized_output)

logger.info('>> Verification')
verify(Path(os.path.join(onnx_path, 'converted.onnx')))
verify(optimized_output)
verify(quantized_output)
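`verify` here appears to be `transformers.convert_graph_to_onnx.verify`, which essentially tries to build an ONNX Runtime session for the file. A roughly equivalent standalone check (a sketch, not the library's actual code):

from onnxruntime import InferenceSession, SessionOptions

def verify_onnx(path):
    # Try to load the graph; ONNX Runtime raises if the model is malformed
    try:
        InferenceSession(str(path), SessionOptions(), providers=['CPUExecutionProvider'])
        print(f'Model {path} correctly loaded')
        return True
    except Exception as e:
        print(f'Error while loading {path}: {e}')
        return False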
time_buffer = []
for _ in trange(100):
    with track_infer_time(time_buffer):
        model_pt_quantized(**model_inputs)

results["PyTorch CPU Quantized"] = OnnxInferenceResult(time_buffer, None)
# print(results)

# ==================================
# Benchmarking ONNX quantized model
# ==================================
from transformers.convert_graph_to_onnx import quantize

# Transformers allows you to easily convert a float32 model to quantized int8 with ONNX Runtime
quantized_model_path = quantize(Path("onnx/bert.opt.onnx"))

# Then you just have to load it through ONNX Runtime as you would normally do
quantized_model = create_model_for_provider(quantized_model_path.as_posix(), "CPUExecutionProvider")

# Warm up the overall model to have a fair comparison
outputs = quantized_model.run(None, inputs_onnx)

# Evaluate performance
time_buffer = []
for _ in trange(100, desc="Tracking inference time on CPUExecutionProvider with quantized model"):
    with track_infer_time(time_buffer):
        outputs = quantized_model.run(None, inputs_onnx)

# Store the result
results["ONNX CPU Quantized"] = OnnxInferenceResult(time_buffer, quantized_model_path)
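The benchmark relies on `track_infer_time` and `OnnxInferenceResult`, helpers defined earlier in the notebook this snippet comes from. They are roughly the following (a sketch of plausible definitions, shown only to make the snippet self-contained):

from contextlib import contextmanager
from dataclasses import dataclass
from time import time
from typing import List

@contextmanager
def track_infer_time(buffer: List[float]):
    # Measure the wall-clock time of the wrapped block and append it to `buffer`
    start = time()
    yield
    buffer.append(time() - start)

@dataclass
class OnnxInferenceResult:
    model_inference_time: List[float]
    optimized_model_path: str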
def export_model_to_onnx(self, fpath, quantize=False, target_opset=None, verbose=1):
    """
    ```
    Export model to onnx
    Args:
      fpath(str): String representing full path to model file where ONNX model will be saved.
                  Example: '/tmp/my_model.onnx'
      quantize(bool): If True, a total of three model files will be created using
                      transformers.convert_graph_to_onnx:
                      1) ONNX model (created directly using keras2onnx)
                      2) an optimized ONNX model (created by transformers library)
                      3) a quantized version of optimized ONNX model (created by transformers library)
                      All files will be created in the parent folder of fpath.
                      Example: If fpath='/tmp/model.onnx', then both /tmp/model-optimized.onnx and
                      /tmp/model-optimized-quantized.onnx will also be created.
      verbose(bool): verbosity
    Returns:
      str: string representing fpath. If quantize=True, returned fpath will be
           different than supplied fpath
    ```
    """
    try:
        import onnx
        import onnxruntime
    except ImportError:
        raise Exception(
            "This method requires ONNX libraries to be installed: "
            + "pip install -q --upgrade onnxruntime==1.10.0 onnx sympy tf2onnx"
        )
    from pathlib import Path

    if type(self.preproc).__name__ == "BERTPreprocessor":
        raise Exception(
            'currently_unsupported: BERT models created with text_classifier("bert",...) are not supported (i.e., keras_bert models). '
            + "Only BERT models created with Transformer(...) are supported."
        )

    if verbose:
        print("converting to ONNX format by way of TFLite ... this may take a few moments...")
    if U.is_huggingface(model=self.model):
        tokenizer = self.preproc.get_tokenizer()
        maxlen = self.preproc.maxlen
        input_dict = tokenizer(
            "Name", return_tensors="tf", padding="max_length", max_length=maxlen
        )

        if version.parse(tf.__version__) < version.parse("2.2"):
            raise Exception("export_model_to_tflite requires tensorflow>=2.2")
        # self.model._set_inputs(input_spec, training=False)  # for tf < 2.2
        self.model._saved_model_inputs_spec = None  # for tf > 2.2
        self.model._set_save_spec(input_dict)       # for tf > 2.2
        self.model._get_save_spec()

    # onnx_model = keras2onnx.convert_keras(self.model, self.model.name, target_opset=target_opset)
    # keras2onnx.save_model(onnx_model, fpath)
    tflite_model_path = self.export_model_to_tflite(
        fpath + "-TFLITE_TMP", verbose=verbose
    )

    import subprocess

    if verbose:
        print("converting to ONNX using tf2onnx...")
    proc = subprocess.run(
        f"python -m tf2onnx.convert --tflite {tflite_model_path} --output {fpath}".split(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if verbose:
        print(proc.returncode)
        print(proc.stdout.decode("ascii"))
        print(proc.stderr.decode("ascii"))

    return_fpath = fpath

    if quantize:
        from transformers.convert_graph_to_onnx import optimize, quantize

        # opt_path = optimize(Path(fpath))
        if U.is_huggingface(model=self.model) and type(self.model).__name__ in [
            "TFDistilBertForSequenceClassification",
            "TFBertForSequenceClassification",
        ]:
            try:
                from onnxruntime.transformers import optimizer
                from onnxruntime.transformers.onnx_model_bert import (
                    BertOptimizationOptions,
                )

                # disable embedding layer norm optimization for better model size reduction
                opt_options = BertOptimizationOptions("bert")
                opt_options.enable_embed_layer_norm = False

                opt_model = optimizer.optimize_model(
                    fpath,
                    "bert",  # bert_keras causes error with transformers
                    num_heads=12,
                    hidden_size=768,
                    optimization_options=opt_options,
                )
                opt_model.save_model_to_file(fpath)
            except Exception:
                warnings.warn("Could not run BERT-specific optimizations")
        quantize_path = quantize(Path(fpath))
        return_fpath = quantize_path.as_posix()
    if verbose:
        print("done.")
    return return_fpath
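End to end, the method above would be used roughly like this (a sketch; `predictor` stands for a hypothetical ktrain predictor object exposing `export_model_to_onnx`, and the returned path points at the quantized file when quantize=True):

from onnxruntime import InferenceSession

onnx_fpath = predictor.export_model_to_onnx("/tmp/model.onnx", quantize=True)

# Load the exported (quantized) graph and inspect the inputs it expects
session = InferenceSession(onnx_fpath, providers=["CPUExecutionProvider"])
print("model inputs:", [i.name for i in session.get_inputs()])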