def convert(self, model, saved_path, model_name):
    """
    Convert a BERT model (transformers, tf.keras) to an optimized ONNX model.

    Two files are written into ``saved_path``: ``<model_name>.onnx`` (the raw
    conversion) and ``<model_name>_optimized.onnx`` (after the ``bert_keras``
    graph optimizations). As a side effect, ``self.sample_inputs`` is set to
    the encoded sample sentence that is fed through the model once before
    conversion.

    :param model: Trained model from transformers.
    :param saved_path: Directory in which to save the ONNX models; it is
        created if it does not exist yet.
    :param model_name: Base file name (without extension) for the saved models.
    :returns: Tuple ``(optimized_model, optimized_model_saved_path)`` where
        ``optimized_model`` is the optimized ONNX model and
        ``optimized_model_saved_path`` is the path it was saved to.
    """
    # exist_ok=True avoids the check-then-create race of
    # "if not os.path.exists(...): os.makedirs(...)".
    os.makedirs(saved_path, exist_ok=True)

    unoptimized_model_saved_path = os.path.join(
        saved_path, '{}.onnx'.format(model_name))
    optimized_model_saved_path = os.path.join(
        saved_path, '{}_optimized.onnx'.format(model_name))

    self.sample_inputs = self.tokenizer.encode_plus(
        "This is a sample input", return_tensors='tf')

    # Step 1: Convert origin transformers model to unoptimized ONNX model.
    # The model is run once on the sample input first; presumably this builds
    # the Keras graph that convert_keras needs -- TODO confirm.
    model.predict(self.sample_inputs.data)
    unoptimized_model = convert_keras(model, model.name,
                                      target_opset=self.target_opset)
    save_model(unoptimized_model, unoptimized_model_saved_path)

    # Step 2: optimizations for trained model converted from Tensorflow(tf.keras)
    optimized_model = optimizer.optimize_model(
        unoptimized_model_saved_path,
        model_type='bert_keras',
        num_heads=self.num_heads,
        hidden_size=self.hidden_size)
    optimized_model.save_model_to_file(optimized_model_saved_path)

    return optimized_model, optimized_model_saved_path
def tf_keras_convert_to_onnx(models, paths, config):
    """
    Convert a Keras model to ONNX and optimize the saved file in place.

    :param models: Keras model to convert; its ``name`` attribute is used as
        the ONNX model name.
    :param paths: File path the ONNX model is saved to. The optimized model
        is written back to the same path.
    :param config: Object providing ``num_attention_heads`` and
        ``hidden_size`` for the optimizer.
    :return: None
    """
    # Raw conversion, persisted so the optimizer can load it from disk.
    raw_onnx_model = keras2onnx.convert_keras(models, models.name, target_opset=12)
    keras2onnx.save_model(raw_onnx_model, paths)

    optimizer_kwargs = dict(
        model_type='bert_keras',
        num_heads=config.num_attention_heads,
        hidden_size=config.hidden_size,
    )
    bert_model = optimizer.optimize_model(paths, **optimizer_kwargs)
    bert_model.use_dynamic_axes()

    # Overwrites the unoptimized file written above.
    bert_model.save_model_to_file(paths)
def export():
    """
    Export the classifier's base model to a float16 ONNX file.

    Removes any existing ``onnx`` directory, saves the base model of the
    ``Classification`` checkpoint in ``model`` to ``./bertBase``, converts it
    to ``onnx/bert-base-cased.onnx`` and then overwrites that file with the
    optimized, float16-converted model.
    """
    onnx_file = "onnx/bert-base-cased.onnx"

    # Start from a clean output directory; a missing directory is not an error.
    shutil.rmtree("onnx", ignore_errors=True)

    classifier = Classification.from_pretrained("model")
    classifier.base_model.save_pretrained("./bertBase")

    convert(
        framework="pt",
        model="bertBase",  # the custom base model saved above
        tokenizer=get_tokenizer(),  # custom tokenizer instead of a hub name
        output=Path(onnx_file),
        opset=12,
    )

    # Mixed precision conversion for bert-base-cased model converted from Pytorch.
    # The path includes the "onnx/" directory so it matches the convert() output.
    fp16_model = optimizer.optimize_model(
        onnx_file,
        model_type="bert",
        num_heads=12,
        hidden_size=768,
    )
    fp16_model.convert_model_float32_to_float16()
    fp16_model.save_model_to_file(onnx_file)
def convert_to_onnx(model: PreTrainedModel, output_path, opset: int = 12):
    """
    Export ``model`` to ONNX and write three checkpoints into ``output_path``.

    Files written:
      * ``checkpoint_without_optimize.onnx`` -- the raw export
      * ``checkpoint_with_optimize.onnx`` -- after ``optimizer.optimize_model``
      * ``checkpoint_with_optimize_fp16.onnx`` -- optimized model converted
        to float16

    NOTE(review): ``tmp_model`` and ``tmp_tokenizer`` are neither parameters
    nor locals of this function; presumably they are module-level objects --
    confirm they are defined before this function is called.

    :param model: Model to export; it is switched to eval mode first.
    :param output_path: Directory the checkpoint files are written to.
    :param opset: ONNX opset version passed to ``export``.
    """
    raw_path, optimized_path, fp16_path = [
        os.path.join(output_path, file_name)
        for file_name in (
            'checkpoint_without_optimize.onnx',
            'checkpoint_with_optimize.onnx',
            'checkpoint_with_optimize_fp16.onnx',
        )
    ]

    model.eval()
    with torch.no_grad():
        shape_info = infer_shapes(tmp_model, tmp_tokenizer)
        input_names, output_names, dynamic_axes, tokens = shape_info
        ordered_input_names, model_args = ensure_valid_input(
            model, tokens, input_names)
        print(f"Model input names: {ordered_input_names}.")
        export(
            model,
            model_args,
            raw_path,
            input_names=ordered_input_names,
            output_names=output_names,
            dynamic_axes=dynamic_axes,
            verbose=True,
            opset_version=opset,
        )
    print(
        f"Finished output checkpoint_without_optimize.onnx to {output_path}."
    )

    # Graph optimization of the raw export.
    bert_model = optimizer.optimize_model(
        raw_path,
        model_type='bert',
        num_heads=12,
        hidden_size=768,
        use_gpu=True,
    )
    bert_model.save_model_to_file(optimized_path)
    print(f"Finished output checkpoint_with_optimize.onnx to {output_path}.")

    # The same optimized model object is converted to float16 and saved again.
    bert_model.convert_model_float32_to_float16()
    bert_model.save_model_to_file(fp16_path)
    print(
        f"Finished output checkpoint_with_optimize_fp16.onnx to {output_path}."
    )
def _build_onnxrt_session(model):
    """
    Build an onnxruntime InferenceSession for a quantized export of ``model``.

    The model is exported to ONNX, optimized with onnxruntime_tools, quantized
    with onnxruntime.quantization, and the quantized file is loaded into a
    session. The three intermediate files are written to fixed paths under
    ``/tmp``.
    """
    # using https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers
    input_names = ['input_ids', 'attention_mask', 'token_type_ids']
    dummy_input = {
        name: torch.ones(1, 128, dtype=torch.int64) for name in input_names
    }
    symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}

    onnx_model_path = "/tmp/temp_turbo_onnx.model"
    onnx_opt_model_path = "/tmp/temp_turbo_onnx_opt.model"
    quantized_model_path = "/tmp/temp_turbo_onnx_q.model"

    # (1) export to onnx fp32 model
    export_args = tuple(dummy_input[name] for name in input_names)
    with open(onnx_model_path, 'wb') as onnx_file:
        torch.onnx.export(
            model,
            export_args,
            onnx_file,
            input_names=input_names,
            output_names=['output'],
            opset_version=11,
            # every input shares the same dynamic batch / sequence axes
            dynamic_axes={name: symbolic_names for name in input_names},
        )

    # (2) optimize the fp32 model
    from onnxruntime_tools import optimizer
    from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions
    bert_options = BertOptimizationOptions('bert')
    bert_options.enable_embed_layer_norm = False
    model_config = model.config
    optimized = optimizer.optimize_model(
        onnx_model_path,
        'bert',
        num_heads=model_config.num_attention_heads,
        hidden_size=model_config.hidden_size,
        optimization_options=bert_options,
    )
    optimized.save_model_to_file(onnx_opt_model_path)

    # (3) quantize the model
    from onnxruntime.quantization import quantize, QuantizationMode
    import onnx
    import onnxruntime
    import onnxruntime.backend
    reloaded = onnx.load(onnx_opt_model_path)
    quantized = quantize(
        reloaded,
        quantization_mode=QuantizationMode.IntegerOps,
        symmetric_weight=True,
        force_fusions=True,
    )
    onnx.save(quantized, quantized_model_path)

    session_options = onnxruntime.SessionOptions()
    session_options.graph_optimization_level = (
        onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    )
    return onnxruntime.InferenceSession(quantized_model_path, session_options)
def convert_to_onnx(model_path, model_name, output_path):
    """
    Convert a PyTorch model to ONNX at ``output_path`` (opset 11).

    ``model_name`` is used as the tokenizer name. When it is ``'gpt2'`` the
    exported file is additionally optimized and overwritten in place.
    """
    output_path = pathlib.Path(output_path)
    convert(
        framework="pt",
        model=model_path,
        tokenizer=model_name,
        output=output_path,
        opset=11,
    )

    if model_name not in ('gpt2',):  # gpt2-medium not supported yet
        return

    optimizer.optimize_model(
        output_path, model_type=model_name
    ).save_model_to_file(output_path)
def export_model_to_onnx(self, fpath, quantize=False, target_opset=None, verbose=1):
    """
    Export model to ONNX.

    Args:
      fpath(str): String representing full path to model file where ONNX
                  model will be saved. Example: '/tmp/my_model.onnx'
      quantize(bool): If True, the exported model is additionally quantized
                  with transformers.convert_graph_to_onnx.quantize and the
                  path of the quantized file is returned. For
                  TFDistilBertForSequenceClassification and
                  TFBertForSequenceClassification models, BERT-specific
                  optimizations from onnxruntime_tools are first applied on a
                  best-effort basis and written back to fpath.
      target_opset(int): ONNX opset passed to keras2onnx.convert_keras.
      verbose(bool): verbosity
    Returns:
      str: string representing fpath. If quantize=True, returned fpath will
           be different than supplied fpath
    Raises:
      Exception: if the ONNX libraries are not installed, if the model is a
           keras_bert model (BERTPreprocessor), or if a Hugging Face model is
           exported with tensorflow < 2.2.
    """
    try:
        # Imported only to verify availability (keras2onnx is used below).
        import onnxruntime
        import onnxruntime_tools
        import onnx
        import keras2onnx
    except ImportError as err:
        raise Exception('This method requires ONNX libraries to be installed: ' +
                        'pip install -q --upgrade onnxruntime==1.5.1 onnxruntime-tools onnx keras2onnx') from err
    from pathlib import Path

    if type(self.preproc).__name__ == 'BERTPreprocessor':
        raise Exception('currently_unsupported:  BERT models created with text_classifier("bert",...) are not supported (i.e., keras_bert models). ' +
                        'Only BERT models created with Transformer(...) are supported.')

    if verbose:
        print('converting to ONNX format ... this may take a few moments...')

    if U.is_huggingface(model=self.model):
        # Fail fast, and name the right method in the error message.
        if version.parse(tf.__version__) < version.parse('2.2'):
            raise Exception('export_model_to_onnx requires tensorflow>=2.2')
        tokenizer = self.preproc.get_tokenizer()
        maxlen = self.preproc.maxlen
        input_dict = tokenizer('Name', return_tensors='tf',
                               padding='max_length', max_length=maxlen)
        # self.model._set_inputs(input_spec, training=False) # for tf < 2.2
        self.model._saved_model_inputs_spec = None  # for tf > 2.2
        self.model._set_save_spec(input_dict)  # for tf > 2.2
        self.model._get_save_spec()

    onnx_model = keras2onnx.convert_keras(self.model, self.model.name,
                                          target_opset=target_opset)
    keras2onnx.save_model(onnx_model, fpath)
    return_fpath = fpath

    if quantize:
        # Aliased so the import does not shadow the boolean `quantize` parameter.
        from transformers.convert_graph_to_onnx import quantize as quantize_onnx_model

        bert_like_models = ['TFDistilBertForSequenceClassification',
                            'TFBertForSequenceClassification']
        if U.is_huggingface(model=self.model) and \
                type(self.model).__name__ in bert_like_models:
            # Best effort: a failure here must not prevent quantization.
            try:
                from onnxruntime_tools import optimizer
                from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions
                # disable embedding layer norm optimization for better model size reduction
                opt_options = BertOptimizationOptions('bert')
                opt_options.enable_embed_layer_norm = False
                opt_model = optimizer.optimize_model(
                    fpath,
                    'bert',  # bert_keras causes error with transformers
                    num_heads=12,
                    hidden_size=768,
                    optimization_options=opt_options)
                opt_model.save_model_to_file(fpath)
            except Exception as err:
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate; the cause is reported in the warning.
                warnings.warn('Could not run BERT-specific optimizations: %s' % err)
        quantize_path = quantize_onnx_model(Path(fpath))
        return_fpath = quantize_path.as_posix()

    if verbose:
        print('done.')
    return return_fpath
if __name__ == '__main__': args = get_args_from_command_line() #text = "tick tock tick" #convert_bert_to_onnx('tick tock', args.model_dir, args.onnx_model_path) #remove_initializer_from_input(args.onnx_model_path, args.onnx_model_path) convert(framework="pt", model=args.model_dir, tokenizer="DeepPavlov/bert-base-cased-conversational", output=args.onnx_model_path, opset=11) # ONNX optimization optimized_model = optimizer.optimize_model(args.onnx_model_path, model_type='bert', num_heads=12, hidden_size=768) optimized_onnx_model_path = os.path.join( os.path.dirname(args.onnx_model_path), 'bert_optimized.onnx') optimized_model.save_model_to_file(optimized_onnx_model_path) print('Optimized model saved at :', optimized_onnx_model_path) # ONNX quantization model = onnx.load(optimized_onnx_model_path) quantized_model = quantize(model, quantization_mode=QuantizationMode.IntegerOps, static=False) optimized_quantized_onnx_model_path = os.path.join( os.path.dirname(optimized_onnx_model_path), 'bert_optimized_quantized.onnx') onnx.save(quantized_model, optimized_quantized_onnx_model_path) print('Quantized&optimized model saved at :',
) # deleting onxx folder and contents, if exists, conversion excepts except: print('no existing folder, creating one') os.makedirs(onnx_path) print('>> converting..') convert(framework="pt", model=model_path, tokenizer=args.model_name, output=onnx_path + 'converted.onnx', opset=11) print('>> optimizing..') # ONNX optimization optimized_model = optimizer.optimize_model(onnx_path + '/converted.onnx', model_type=args.model_type, num_heads=12, hidden_size=768) optimized_onnx_model_path = os.path.join(onnx_path, 'bert_optimized.onnx') optimized_model.save_model_to_file(optimized_onnx_model_path) print('Optimized model saved at :', optimized_onnx_model_path) print('>> quantizing..') model = onnx.load(onnx_path + '/converted.onnx') quantized_model = quantize(model=model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True) optimized_quantized_onnx_model_path = os.path.join( os.path.dirname(optimized_onnx_model_path), 'ONNX_model_optimized_quantized.onnx')
def convert_to_onnx(cls, model_name, output_path, task_type, convert_to_float16=False, quantize=False, opset_version=11):
    """
    Convert a PyTorch model from transformers hub to an ONNX Model.

    Besides ``model.onnx``, the processor files, the model config files and
    ``onnx_model_config.json`` are written to ``output_path``.

    :param model_name: transformers model name
    :type model_name: str
    :param output_path: output Path to write the converted to
    :type output_path: Path
    :param task_type: Type of task for the model. It is looked up in the
        pipeline map below, which has entries for "embeddings",
        "question_answering" and "ner"; any other value raises ``KeyError``.
    :param convert_to_float16: By default, the model use float32 precision. With half precision of float16, inference
        should be faster on Nvidia GPUs with Tensor core like T4 or V100. On older GPUs, float32 might be more
        performant.
    :type convert_to_float16: bool
    :param quantize: convert floating point number to integers
    :type quantize: bool
    :param opset_version: ONNX opset version
    :type opset_version: int
    :return: None
    :raises Exception: if the language model class is not one of "Bert",
        "Roberta" or "XLMRoberta".
    """
    language_model_class = LanguageModel.get_language_model_class(model_name)
    if language_model_class not in ["Bert", "Roberta", "XLMRoberta"]:
        raise Exception("The current ONNX conversion only support 'BERT', 'RoBERTa', and 'XLMRoberta' models.")

    task_type_to_pipeline_map = {
        "question_answering": "question-answering",
        "embeddings": "feature-extraction",
        "ner": "ner"
    }

    onnx_model_path = output_path / "model.onnx"

    convert(
        pipeline_name=task_type_to_pipeline_map[task_type],
        framework="pt",
        model=model_name,
        output=onnx_model_path,
        opset=opset_version,
        # Compare by value: `is` on a string literal tests object identity
        # and is not guaranteed to be True for equal strings.
        use_external_format=(language_model_class == "XLMRoberta"))

    # save processor & model config files that are needed when loading the model with the FARM Inferencer
    processor = Processor.convert_from_transformers(
        tokenizer_name_or_path=model_name,
        task_type=task_type,
        max_seq_len=256,
        doc_stride=128,
        use_fast=True)
    processor.save(output_path)
    model = AdaptiveModel.convert_from_transformers(model_name, device="cpu", task_type=task_type)
    model.save(output_path)
    os.remove(output_path / "language_model.bin")  # remove the actual PyTorch model(only configs are required)

    onnx_model_config = {
        "task_type": task_type,
        "onnx_opset_version": opset_version,
        "language_model_class": language_model_class,
        "language": model.language_model.language
    }
    with open(output_path / "onnx_model_config.json", "w") as f:
        json.dump(onnx_model_config, f)

    if convert_to_float16:
        from onnxruntime_tools import optimizer
        config = AutoConfig.from_pretrained(model_name)
        optimized_model = optimizer.optimize_model(
            input=str(onnx_model_path),
            model_type='bert',
            # The optimizer needs the attention-head count, not the layer count.
            num_heads=config.num_attention_heads,
            hidden_size=config.hidden_size)
        optimized_model.convert_model_float32_to_float16()
        # Overwrite the exported model inside output_path (not the current
        # working directory) so the quantize step below and the loader see
        # the float16 model.
        optimized_model.save_model_to_file(str(onnx_model_path))

    if quantize:
        quantize_model(onnx_model_path)
for ep in ep_list: dev = ep_dev_map.get(ep) dynamic = { "non_dynamic": export_model_path, "dynamic": export_model_path_dynamic } for k, v in dynamic.items(): print(k) # This will save the optimized graph to the directory specified in optimized_model_filepath sess_options.optimized_model_filepath = os.path.join( output_dir, "optimized_model_{}.onnx".format(dev)) # Optional: store the optimized graph and view it using Netron to verify that model is fully optimized. # Note that this will increase session creation time so enable it for debugging only. optimized_model = optimizer.optimize_model(v, model_type='bert', num_heads=12, hidden_size=768) optimized_model.save_model_to_file( sess_options.optimized_model_filepath) # Please change the value according to best setting in Performance Test Tool result. sess_options.intra_op_num_threads = psutil.cpu_count(logical=True) session = onnxruntime.InferenceSession( sess_options.optimized_model_filepath, sess_options) session.set_providers([ep]) latency = [] for i in range(total_samples): data = dataset[i] # TODO: use IO Binding (see https://github.com/microsoft/onnxruntime/pull/4206) to improve performance. ort_inputs = {
# # An optional step unless # # you want to get a model with mixed precision for perf accelartion on newer GPU # # or you are working with Tensorflow(tf.keras) models or pytorch models other than bert # !pip install onnxruntime-tools # from onnxruntime_tools import optimizer # # Mixed precision conversion for bert-base-cased model converted from Pytorch # optimized_model = optimizer.optimize_model("bert-base-cased.onnx", model_type='bert', num_heads=12, hidden_size=768) # optimized_model.convert_model_float32_to_float16() # optimized_model.save_model_to_file("bert-base-cased.onnx") # # optimizations for bert-base-cased model converted from Tensorflow(tf.keras) # optimized_model = optimizer.optimize_model("bert-base-cased.onnx", model_type='bert_keras', num_heads=12, hidden_size=768) # optimized_model.save_model_to_file("bert-base-cased.onnx") # optimize transformer-based models with onnxruntime-tools from onnxruntime_tools import optimizer from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions # disable embedding layer norm optimization for better model size reduction opt_options = BertOptimizationOptions('bert') opt_options.enable_embed_layer_norm = False opt_model = optimizer.optimize_model('onnx/bert-base-cased.onnx', 'bert', num_heads=12, hidden_size=768, optimization_options=opt_options) opt_model.save_model_to_file('onnx/bert.opt.onnx')