def __init__(
    self,
    model_name,
    pipeline_name,
    model_path=None,
    device="cpu",
    quantization=False,
    opset=11,
    force=False,
    **convert_kwargs,
):
    if model_path is None:
        model_path = f"onnx/{model_name}.onnx"
    model_path = Path(model_path)
    if not model_path.is_file() or force:
        convert(
            framework="pt",
            model=model_name,
            output=model_path,
            opset=opset,
            pipeline_name=pipeline_name,
            **convert_kwargs,
        )
    if quantization:
        model_path = optimize(model_path)
        model_path = quantize(model_path)
    self.model_path = str(model_path)
    self.provider = "CPUExecutionProvider" if device == "cpu" else "CUDAExecutionProvider"
    self.session = self.create_model_for_provider()
    self.config = AutoConfig.from_pretrained(model_name)
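The constructor above delegates to a `create_model_for_provider` method that is not shown. A minimal sketch of such a helper, assuming it simply wraps `onnxruntime.InferenceSession` around the stored path and provider (the session options here are illustrative defaults, not part of the original snippet):

from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions

def create_model_for_provider(self):
    # Illustrative session tuning; adjust the thread count to the deployment host
    options = SessionOptions()
    options.intra_op_num_threads = 1
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
    # Build the runtime session on the provider chosen in __init__
    return InferenceSession(self.model_path, options, providers=[self.provider])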
def test_quantize_pytorch(self):
    for model in OnnxExportTestCase.MODEL_TO_TEST:
        path = self._test_export(model, "pt", 12)
        quantized_path = quantize(path)

        # Ensure the actual quantized model is not bigger than the original one
        if quantized_path.stat().st_size >= Path(path).stat().st_size:
            self.fail("Quantized model is bigger than initial ONNX model")
def test_quantize_tf(self):
    for model, model_kwargs in OnnxExportTestCase.MODEL_TO_TEST:
        path = self._test_export(model, "tf", 12, **model_kwargs)
        quantized_path = quantize(Path(path))

        # Ensure the actual quantized model is not bigger than the original one
        if quantized_path.stat().st_size >= Path(path).stat().st_size:
            self.fail("Quantized model is bigger than initial ONNX model")
def _save_impl(self, path, export_type=None):
    path = Path(path)
    self._save_gpu(path)
    if export_type == 'cpu':
        with TemporaryDirectory() as temp_dir:
            temp_model_path = Path(temp_dir) / 'temp.onnx'
            convert(framework='pt',
                    model=str(path),
                    output=temp_model_path,
                    pipeline_name='sentiment-analysis',
                    opset=11)
            optimized_path = optimize(temp_model_path)
            quantized_path = quantize(optimized_path)
            target_path = BertCpuClassifier.onnx_model_path(path)
            with open(quantized_path, 'rb') as src, gzip.open(target_path, 'wb') as dst:
                shutil.copyfileobj(src, dst)
        os.remove(path / 'pytorch_model.bin')
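Because `_save_impl` gzips the quantized model, a matching load step has to decompress the file before handing the bytes to ONNX Runtime. A minimal sketch, reusing the snippet's `BertCpuClassifier.onnx_model_path` helper (hypothetical, taken from the snippet above):

import gzip
from onnxruntime import InferenceSession

def _load_onnx_session(path):
    # Decompress the gzipped ONNX file written by _save_impl and build a CPU session
    with gzip.open(BertCpuClassifier.onnx_model_path(path), 'rb') as f:
        model_bytes = f.read()
    return InferenceSession(model_bytes, providers=['CPUExecutionProvider'])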
from pathlib import Path
import tempfile

from transformers.convert_graph_to_onnx import convert, quantize

dest = Path(tempfile.mkdtemp(), "question-answering.onnx")
convert(
    pipeline_name="question-answering",
    model="distilbert-base-cased-distilled-squad",
    output=dest,
    framework="pt",
    opset=11,
)
quantize(dest)
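To sanity-check the export, the quantized model can be loaded with ONNX Runtime. `quantize` returns the path of the new `-quantized` file, so the call above would ideally capture it. A minimal sketch, assuming the tokenizer matches the exported checkpoint:

from onnxruntime import InferenceSession
from transformers import AutoTokenizer

quantized_dest = quantize(dest)  # returns the Path of the "-quantized" sibling file

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
session = InferenceSession(quantized_dest.as_posix(), providers=["CPUExecutionProvider"])

# Run the question-answering graph on a toy example
inputs = tokenizer("Who wrote it?", "It was written by Jane.", return_tensors="np")
start_logits, end_logits = session.run(None, dict(inputs))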
def export_model_to_onnx(self, fpath, quantize=False, target_opset=None, verbose=1):
    """
    Export model to ONNX.

    Args:
      fpath(str): String representing full path to model file where ONNX model will be saved.
                  Example: '/tmp/my_model.onnx'
      quantize(bool): If True, a total of three model files will be created using
                      transformers.convert_graph_to_onnx:
                      1) ONNX model (created directly using keras2onnx)
                      2) an optimized ONNX model (created by transformers library)
                      3) a quantized version of optimized ONNX model (created by transformers library)
                      All files will be created in the parent folder of fpath.
                      Example: If fpath='/tmp/model.onnx', then both /tmp/model-optimized.onnx and
                      /tmp/model-optimized-quantized.onnx will also be created.
      verbose(bool): verbosity

    Returns:
      str: string representing fpath. If quantize=True, returned fpath will be
           different than supplied fpath.
    """
    try:
        import onnxruntime, onnxruntime_tools, onnx, keras2onnx
    except ImportError:
        raise Exception('This method requires ONNX libraries to be installed: ' +
                        'pip install -q --upgrade onnxruntime==1.5.1 onnxruntime-tools onnx keras2onnx')
    from pathlib import Path

    if type(self.preproc).__name__ == 'BERTPreprocessor':
        raise Exception('currently_unsupported: BERT models created with text_classifier("bert",...) are not supported (i.e., keras_bert models). ' +
                        'Only BERT models created with Transformer(...) are supported.')

    if verbose:
        print('converting to ONNX format ... this may take a few moments...')
    if U.is_huggingface(model=self.model):
        tokenizer = self.preproc.get_tokenizer()
        maxlen = self.preproc.maxlen
        input_dict = tokenizer('Name', return_tensors='tf',
                               padding='max_length', max_length=maxlen)

        if version.parse(tf.__version__) < version.parse('2.2'):
            raise Exception('export_model_to_tflite requires tensorflow>=2.2')
        # self.model._set_inputs(input_spec, training=False)  # for tf < 2.2
        self.model._saved_model_inputs_spec = None  # for tf > 2.2
        self.model._set_save_spec(input_dict)       # for tf > 2.2
        self.model._get_save_spec()

    onnx_model = keras2onnx.convert_keras(self.model, self.model.name, target_opset=target_opset)
    keras2onnx.save_model(onnx_model, fpath)
    return_fpath = fpath

    if quantize:
        from transformers.convert_graph_to_onnx import optimize, quantize
        # opt_path = optimize(Path(fpath))
        if U.is_huggingface(model=self.model) and \
           type(self.model).__name__ in ['TFDistilBertForSequenceClassification',
                                         'TFBertForSequenceClassification']:
            try:
                from onnxruntime_tools import optimizer
                from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions

                # disable embedding layer norm optimization for better model size reduction
                opt_options = BertOptimizationOptions('bert')
                opt_options.enable_embed_layer_norm = False

                opt_model = optimizer.optimize_model(
                    fpath,
                    'bert',  # bert_keras causes error with transformers
                    num_heads=12,
                    hidden_size=768,
                    optimization_options=opt_options)
                opt_model.save_model_to_file(fpath)
            except Exception:
                warnings.warn('Could not run BERT-specific optimizations')
        quantize_path = quantize(Path(fpath))
        return_fpath = quantize_path.as_posix()
    if verbose:
        print('done.')
    return return_fpath
    args.country_code][f'iter{str(args.iteration_number)}'][label],
    label, 'models', 'best_model')

onnx_path = os.path.join(model_path, 'onnx')
try:
    # delete the existing onnx folder and its contents, if present; the conversion raises otherwise
    shutil.rmtree(onnx_path)
except FileNotFoundError:
    logger.info('no existing folder, creating one')
os.makedirs(onnx_path)

logger.info('>> converting..')
convert(framework="pt",
        model=model_path,
        tokenizer=convert_model_path_to_model_name(model_path),
        output=Path(os.path.join(onnx_path, 'converted.onnx')),
        opset=11,
        pipeline_name='sentiment-analysis')

logger.info('>> ONNX optimization')
optimized_output = optimize(Path(os.path.join(onnx_path, 'converted.onnx')))

logger.info('>> Quantization')
quantized_output = quantize(optimized_output)

logger.info('>> Verification')
verify(Path(os.path.join(onnx_path, 'converted.onnx')))
verify(optimized_output)
verify(quantized_output)
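`verify` here appears to be `transformers.convert_graph_to_onnx.verify`, which essentially tries to build an ONNX Runtime session for the file. A roughly equivalent standalone check (a sketch, not the library's actual code):

from onnxruntime import InferenceSession, SessionOptions

def verify_onnx(path):
    # Try to load the graph; ONNX Runtime raises if the model is malformed
    try:
        InferenceSession(str(path), SessionOptions(), providers=['CPUExecutionProvider'])
        print(f'Model {path} correctly loaded')
        return True
    except Exception as e:
        print(f'Error while loading {path}: {e}')
        return False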
time_buffer = []
for _ in trange(100):
    with track_infer_time(time_buffer):
        model_pt_quantized(**model_inputs)

results["PyTorch CPU Quantized"] = OnnxInferenceResult(time_buffer, None)
# print(results)

# ==================================
# Benchmarking ONNX quantized model
# ==================================
from transformers.convert_graph_to_onnx import quantize

# Transformers allows you to easily convert a float32 model to quantized int8 with ONNX Runtime
quantized_model_path = quantize(Path("onnx/bert.opt.onnx"))

# Then you just have to load it through ONNX Runtime as you would normally do
quantized_model = create_model_for_provider(quantized_model_path.as_posix(), "CPUExecutionProvider")

# Warm up the overall model to have a fair comparison
outputs = quantized_model.run(None, inputs_onnx)

# Evaluate performance
time_buffer = []
for _ in trange(100, desc="Tracking inference time on CPUExecutionProvider with quantized model"):
    with track_infer_time(time_buffer):
        outputs = quantized_model.run(None, inputs_onnx)

# Store the result
results["ONNX CPU Quantized"] = OnnxInferenceResult(time_buffer, quantized_model_path)
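The benchmark relies on `track_infer_time` and `OnnxInferenceResult`, helpers defined earlier in the notebook this snippet comes from. They are roughly the following (a sketch of plausible definitions, shown only to make the snippet self-contained):

from contextlib import contextmanager
from dataclasses import dataclass
from time import time
from typing import List

@contextmanager
def track_infer_time(buffer: List[float]):
    # Measure the wall-clock time of the wrapped block and append it to `buffer`
    start = time()
    yield
    buffer.append(time() - start)

@dataclass
class OnnxInferenceResult:
    model_inference_time: List[float]
    optimized_model_path: str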
def export_model_to_onnx(self, fpath, quantize=False, target_opset=None, verbose=1):
    """
    ```
    Export model to onnx
    Args:
      fpath(str): String representing full path to model file where ONNX model will be saved.
                  Example: '/tmp/my_model.onnx'
      quantize(bool): If True, a total of three model files will be created using
                      transformers.convert_graph_to_onnx:
                      1) ONNX model (created directly using keras2onnx)
                      2) an optimized ONNX model (created by transformers library)
                      3) a quantized version of optimized ONNX model (created by transformers library)
                      All files will be created in the parent folder of fpath.
                      Example: If fpath='/tmp/model.onnx', then both /tmp/model-optimized.onnx and
                      /tmp/model-optimized-quantized.onnx will also be created.
      verbose(bool): verbosity
    Returns:
      str: string representing fpath. If quantize=True, returned fpath will be
           different than supplied fpath
    ```
    """
    try:
        import onnx
        import onnxruntime
    except ImportError:
        raise Exception(
            "This method requires ONNX libraries to be installed: "
            + "pip install -q --upgrade onnxruntime==1.10.0 onnx sympy tf2onnx"
        )
    from pathlib import Path

    if type(self.preproc).__name__ == "BERTPreprocessor":
        raise Exception(
            'currently_unsupported: BERT models created with text_classifier("bert",...) are not supported (i.e., keras_bert models). '
            + "Only BERT models created with Transformer(...) are supported."
        )

    if verbose:
        print("converting to ONNX format by way of TFLite ... this may take a few moments...")
    if U.is_huggingface(model=self.model):
        tokenizer = self.preproc.get_tokenizer()
        maxlen = self.preproc.maxlen
        input_dict = tokenizer(
            "Name", return_tensors="tf", padding="max_length", max_length=maxlen
        )

        if version.parse(tf.__version__) < version.parse("2.2"):
            raise Exception("export_model_to_tflite requires tensorflow>=2.2")
        # self.model._set_inputs(input_spec, training=False)  # for tf < 2.2
        self.model._saved_model_inputs_spec = None  # for tf > 2.2
        self.model._set_save_spec(input_dict)       # for tf > 2.2
        self.model._get_save_spec()

    # onnx_model = keras2onnx.convert_keras(self.model, self.model.name, target_opset=target_opset)
    # keras2onnx.save_model(onnx_model, fpath)
    tflite_model_path = self.export_model_to_tflite(
        fpath + "-TFLITE_TMP", verbose=verbose
    )

    import subprocess

    if verbose:
        print("converting to ONNX using tf2onnx...")
    proc = subprocess.run(
        f"python -m tf2onnx.convert --tflite {tflite_model_path} --output {fpath}".split(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if verbose:
        print(proc.returncode)
        print(proc.stdout.decode("ascii"))
        print(proc.stderr.decode("ascii"))

    return_fpath = fpath

    if quantize:
        from transformers.convert_graph_to_onnx import optimize, quantize

        # opt_path = optimize(Path(fpath))
        if U.is_huggingface(model=self.model) and type(self.model).__name__ in [
            "TFDistilBertForSequenceClassification",
            "TFBertForSequenceClassification",
        ]:
            try:
                from onnxruntime.transformers import optimizer
                from onnxruntime.transformers.onnx_model_bert import (
                    BertOptimizationOptions,
                )

                # disable embedding layer norm optimization for better model size reduction
                opt_options = BertOptimizationOptions("bert")
                opt_options.enable_embed_layer_norm = False

                opt_model = optimizer.optimize_model(
                    fpath,
                    "bert",  # bert_keras causes error with transformers
                    num_heads=12,
                    hidden_size=768,
                    optimization_options=opt_options,
                )
                opt_model.save_model_to_file(fpath)
            except Exception:
                warnings.warn("Could not run BERT-specific optimizations")
        quantize_path = quantize(Path(fpath))
        return_fpath = quantize_path.as_posix()
    if verbose:
        print("done.")
    return return_fpath
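End to end, the method above would be used roughly like this (a sketch; `predictor` stands for a hypothetical ktrain predictor object exposing `export_model_to_onnx`, and the returned path points at the quantized file when quantize=True):

from onnxruntime import InferenceSession

onnx_fpath = predictor.export_model_to_onnx("/tmp/model.onnx", quantize=True)

# Load the exported (quantized) graph and inspect the inputs it expects
session = InferenceSession(onnx_fpath, providers=["CPUExecutionProvider"])
print("model inputs:", [i.name for i in session.get_inputs()])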