Example #1
    def test_attention_fusion(self):
        model = create_bert_attention()
        model_dir = '.'
        model_path = os.path.join(model_dir, "attention.onnx")
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path)
        os.remove(model_path)

        expected_model_path = os.path.join(os.path.dirname(__file__),
                                           'test_data', 'models',
                                           'attention_opt.onnx')
        expected = onnx.load(expected_model_path)
        self.assertEqual(str(optimized_model.model.graph), str(expected.graph))
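
A minimal standalone sketch (not part of the original test) of the same entry point: optimize_model returns a model wrapper whose save_model_to_file method persists the fused graph. The file names and the num_heads/hidden_size values here are placeholders.

from onnxruntime.transformers.optimizer import optimize_model

# Load, fuse, and save back to disk; paths are illustrative.
optimized = optimize_model("attention.onnx", model_type="bert",
                           num_heads=12, hidden_size=768)
optimized.save_model_to_file("attention_opt.onnx")
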
Example #2
    def test_gpt2_past_mask(self):
        input_model_path = _get_test_model_path('gpt2_past_mask')
        model = optimize_model(input_model_path, 'gpt2', num_heads=2, hidden_size=4)
        expected_node_count = {
            'EmbedLayerNormalization': 0,
            'Attention': 1,
            'Gelu': 0,
            'FastGelu': 1,
            'BiasGelu': 0,
            'LayerNormalization': 2,
            'SkipLayerNormalization': 0
        }
        self.verify_node_count(model, expected_node_count,
                               'test_gpt2_past_mask')
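
The tests in this listing call a verify_node_count helper that is defined elsewhere in the test class. A plausible sketch, assuming the wrapper returned by optimize_model exposes its ONNX graph as model.model.graph (this helper is hypothetical, not the original):

from collections import Counter

def verify_node_count(self, model, expected_node_count, test_name):
    # Count nodes per op_type in the optimized graph and compare
    # against the expected table.
    actual = Counter(node.op_type for node in model.model.graph.node)
    for op_type, expected in expected_node_count.items():
        self.assertEqual(
            actual[op_type], expected,
            f"{test_name}: expected {expected} {op_type} node(s), got {actual[op_type]}")
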
Example #3
    def test_3d_attention_fusion_tf2onnx_model(self):
        model = create_tf2onnx_attention_3d()
        model_dir = '.'
        model_path = os.path.join(model_dir, 'bert_3d_attention.onnx')
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path,
                                         model_type='bert_tf',
                                         num_heads=4,
                                         hidden_size=16)
        os.remove(model_path)

        expected_model_path = os.path.join(os.path.dirname(__file__),
                                           'test_data', 'models',
                                           'bert_3d_attention_opt.onnx')
        expected = onnx.load(expected_model_path)
        self.assertEqual(str(optimized_model.model.graph), str(expected.graph))
Example #4
    def test_attention_fusion_for_varied_qkv_dimensions(self):
        model = create_bert_attention(input_hidden_size=16,
                                      num_heads=2,
                                      pruned_qk_hidden_size=24,
                                      pruned_v_hidden_size=16)
        model_dir = '.'
        model_path = os.path.join(model_dir, "attention_with_varied_qkv.onnx")
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path)
        os.remove(model_path)

        expected_model_path = os.path.join(
            os.path.dirname(__file__), 'test_data', 'models',
            'attention_with_varied_qkv_opt.onnx')
        expected = onnx.load(expected_model_path)
        self.assertEqual(str(optimized_model.model.graph), str(expected.graph))
Example #5
    def test_multiple_embed(self):
        input_model_path = _get_test_model_path('multiple_embed')
        model = optimize_model(input_model_path,
                               'bert',
                               num_heads=2,
                               hidden_size=4)
        expected_node_count = {
            'EmbedLayerNormalization': 2,
            'Attention': 2,
            'Gelu': 0,
            'FastGelu': 0,
            'BiasGelu': 0,
            'LayerNormalization': 0,
            'SkipLayerNormalization': 0
        }
        self.verify_node_count(model, expected_node_count,
                               'test_multiple_embed')
Example #6
    def test_attention_fusion_reverse_add_order(self):
        model = create_bert_attention(input_hidden_size=16,
                                      num_heads=2,
                                      pruned_qk_hidden_size=8,
                                      pruned_v_hidden_size=8,
                                      switch_add_inputs=True)
        model_dir = '.'
        model_path = os.path.join(model_dir, "bert_attention_reverse_add_order.onnx")
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path)
        os.remove(model_path)

        # reversing the Add input order should yield the same optimized model
        expected_model_path = os.path.join(os.path.dirname(__file__),
                                           'test_data', 'models',
                                           'pruned_attention_opt.onnx')
        expected = onnx.load(expected_model_path)
        self.assertEqual(str(optimized_model.model.graph), str(expected.graph))
Example #7
    def test_fusions(self):
        sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
        from onnxruntime.transformers.optimizer import optimize_model

        for test_case in test_cases:
            source, operator, model_class = test_case
            model = model_class()
            dummy_input = torch.ones(3, dtype=torch.float32)
            test_name = f"{operator}_{source}"
            onnx_path = f"{test_name}.onnx"
            torch.onnx.export(model, (dummy_input,),
                              onnx_path,
                              input_names=['input'],
                              output_names=['output'])
            optimizer = optimize_model(onnx_path, 'bert')
            # optimizer.save_model_to_file(f"{operator}_{source}_opt.onnx")
            os.remove(onnx_path)
            expected_node_count = {operator: 1}
            self.verify_node_count(optimizer, expected_node_count, test_name)
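
test_cases is defined elsewhere in this file. A hedged sketch of what one entry might look like: a tiny torch module whose forward computes the erf-based Gelu formula, which the 'bert' optimizer can fuse into a single Gelu node (the names here are illustrative, not the originals):

import math
import torch

class GeluModel(torch.nn.Module):
    # Gelu(x) = x * 0.5 * (1 + erf(x / sqrt(2))); exported to ONNX this
    # becomes the Erf subgraph that the fusion pass recognizes.
    def forward(self, x):
        return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

# (source label, expected fused operator, model class)
test_cases = [('huggingface', 'Gelu', GeluModel)]
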
Example #8
                                      dummy_dataloader,
                                      benchmark=True)
            latency = np.array(results).mean() / args.eval_batch_size

            print('Latency: {:.3f} ms'.format(latency * 1000))
            print('Throughput: {:.3f} items/sec'.format(args.eval_batch_size *
                                                        1. / latency))
        print('--------------------------------------------------------------')

    if args.tune:
        from onnxruntime.transformers import optimizer
        from onnxruntime.transformers.onnx_model_bert import BertOptimizationOptions
        opt_options = BertOptimizationOptions('bert')
        opt_options.enable_embed_layer_norm = False

        model_optimizer = optimizer.optimize_model(
            args.model_path,
            'bert',
            num_heads=12,
            hidden_size=768,
            optimization_options=opt_options)
        model = model_optimizer.model

        from lpot.experimental import Quantization, common
        quantize = Quantization(args.config)
        quantize.model = common.Model(model)
        quantize.calib_dataloader = eval_dataloader
        quantize.eval_func = eval_func
        q_model = quantize()
        q_model.save(args.output_model)
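
eval_dataloader and eval_func come from earlier in this script. As a rough sketch of the contract LPOT (now Intel Neural Compressor) expects, eval_func takes the candidate model and returns a scalar accuracy; evaluate() below is a hypothetical stand-in for the script's own evaluation loop:

def eval_func(model):
    # Return a single accuracy number for the model under test.
    return evaluate(model, eval_dataloader)  # evaluate() is hypothetical
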
Example #9
    def export_model_to_onnx(self,
                             fpath,
                             quantize=False,
                             target_opset=None,
                             verbose=1):
        """
        ```
        Export model to onnx
        Args:
          fpath(str): String representing full path to model file where ONNX model will be saved.
                      Example: '/tmp/my_model.onnx'
          quantize(str): If True, will create a total of three model files will be created using transformers.convert_graph_to_onnx: 
                         1) ONNX model  (created directly using keras2onnx
                         2) an optimized ONNX model (created by transformers library)
                         3) a quantized version of optimized ONNX model (created by transformers library)
                         All files will be created in the parent folder of fpath:
                         Example: 
                           If fpath='/tmp/model.onnx', then both /tmp/model-optimized.onnx and
                           /tmp/model-optimized-quantized.onnx will also be created.
          verbose(bool): verbosity
        Returns:
          str: string representing fpath.  If quantize=True, returned fpath will be different than supplied fpath
        ```
        """
        try:
            import onnxruntime, onnx
        except ImportError:
            raise Exception('This method requires ONNX libraries to be installed: '+\
                            'pip install -q --upgrade onnxruntime==1.10.0 onnx sympy tf2onnx')
        from pathlib import Path
        if type(self.preproc).__name__ == 'BERTPreprocessor':
            raise Exception('currently_unsupported:  BERT models created with text_classifier("bert",...) are not supported (i.e., keras_bert models). ' +\
                            'Only BERT models created with Transformer(...) are supported.')

        if verbose:
            print(
                'converting to ONNX format by way of TFLite ... this may take a few moments...'
            )
        if U.is_huggingface(model=self.model):
            tokenizer = self.preproc.get_tokenizer()
            maxlen = self.preproc.maxlen
            input_dict = tokenizer('Name',
                                   return_tensors='tf',
                                   padding='max_length',
                                   max_length=maxlen)

            if version.parse(tf.__version__) < version.parse('2.2'):
                raise Exception(
                    'export_model_to_onnx requires tensorflow>=2.2')
                #self.model._set_inputs(input_spec, training=False) # for tf < 2.2
            self.model._saved_model_inputs_spec = None  # for tf >= 2.2
            self.model._set_save_spec(input_dict)  # for tf >= 2.2
            self.model._get_save_spec()

        #onnx_model = keras2onnx.convert_keras(self.model, self.model.name, target_opset=target_opset)
        #keras2onnx.save_model(onnx_model, fpath)
        tflite_model_path = self.export_model_to_tflite(fpath + '-TFLITE_TMP',
                                                        verbose=verbose)

        import subprocess
        if verbose: print('converting to ONNX using tf2onnx...')
        proc = subprocess.run(
            f'python -m tf2onnx.convert --tflite {tflite_model_path} --output {fpath}'
            .split(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        if verbose:
            print(proc.returncode)
            print(proc.stdout.decode('ascii'))
            print(proc.stderr.decode('ascii'))
        return_fpath = fpath

        if quantize:
            from transformers.convert_graph_to_onnx import optimize, quantize
            #opt_path = optimize(Path(fpath))

            if U.is_huggingface(model=self.model) and\
               type(self.model).__name__ in ['TFDistilBertForSequenceClassification', 'TFBertForSequenceClassification']:
                try:
                    from onnxruntime.transformers import optimizer
                    from onnxruntime.transformers.onnx_model_bert import BertOptimizationOptions
                    # disable embedding layer norm optimization for better model size reduction
                    opt_options = BertOptimizationOptions('bert')
                    opt_options.enable_embed_layer_norm = False
                    opt_model = optimizer.optimize_model(
                        fpath,
                        'bert',  # bert_keras causes error with transformers
                        num_heads=12,
                        hidden_size=768,
                        optimization_options=opt_options)
                    opt_model.save_model_to_file(fpath)
                except Exception:
                    warnings.warn('Could not run BERT-specific optimizations')
            quantize_path = quantize(Path(fpath))
            return_fpath = quantize_path.as_posix()
        if verbose: print('done.')
        return return_fpath
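
A hedged usage sketch: assuming this method lives on a ktrain predictor object, the exported file can be loaded directly with onnxruntime for inference.

import onnxruntime

# 'predictor' is assumed to expose export_model_to_onnx as defined above.
onnx_fpath = predictor.export_model_to_onnx('/tmp/model.onnx', quantize=True)
session = onnxruntime.InferenceSession(onnx_fpath)
print([inp.name for inp in session.get_inputs()])
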