Example #1
def optimize_onnx_model(onnx_model_filename, model_type, num_attention_heads,
                        hidden_size, fp16):
    optimized_model_filename = onnx_model_filename.replace(
        ".onnx", "_fp16.onnx" if fp16 else "_fp32.onnx")
    if not os.path.exists(optimized_model_filename):
        import bert_model_optimization as bert_opt
        # Use onnxruntime to optimize the model (saved to *_ort_cpu.onnx) and
        # record its fused-operator statistics for comparison.
        opt_model = bert_opt.optimize_model(onnx_model_filename,
                                            model_type,
                                            num_heads=num_attention_heads,
                                            hidden_size=hidden_size,
                                            opt_level=99,
                                            only_onnxruntime=True)
        optimize_model_statistics[
            onnx_model_filename] = opt_model.get_fused_operator_statistics()

        # Use the optimization script itself (opt_level=0 disables onnxruntime graph optimization).
        opt_model = bert_opt.optimize_model(onnx_model_filename,
                                            model_type,
                                            num_heads=num_attention_heads,
                                            hidden_size=hidden_size,
                                            opt_level=0)
        optimize_model_statistics[
            optimized_model_filename] = opt_model.get_fused_operator_statistics()

        if fp16:
            opt_model.convert_model_float32_to_float16()
        opt_model.save_model_to_file(optimized_model_filename)
    else:
        logger.info(
            f"Skip optimization since the model already exists: {optimized_model_filename}"
        )
    return optimized_model_filename
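A minimal driver sketch for the helper above, assuming logger and optimize_model_statistics are the module-level globals the function references (both are defined elsewhere in the original script), and using a hypothetical model file with BERT-base dimensions:

import logging
import os  # needed by optimize_onnx_model for the existence check

logger = logging.getLogger(__name__)
optimize_model_statistics = {}  # filled in by optimize_onnx_model

# Hypothetical input file; stats are keyed by the optimized filename,
# so this lookup only succeeds when optimization actually ran.
optimized = optimize_onnx_model("bert_base.onnx", "bert",
                                num_attention_heads=12,
                                hidden_size=768,
                                fp16=False)
print(optimize_model_statistics[optimized])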
Example #2
    def test_pytorch_model_0_gpu(self):
        if 'CUDAExecutionProvider' not in onnxruntime.get_available_providers():
            print("skip test_pytorch_model_0_gpu since no gpu found")
            return

        input = BERT_TEST_MODELS['bert_pytorch_0']
        bert_model = optimize_model(input,
                                    'bert',
                                    gpu_only=True,
                                    num_heads=2,
                                    hidden_size=8,
                                    sequence_length=10,
                                    input_int32=False,
                                    float16=False)

        expected_node_count = {
            'EmbedLayerNormalization': 1,
            'Attention': 12,
            'SkipLayerNormalization': 24,
            'FastGelu': 12,
            'Gelu': 0,
            'BiasGelu': 0
        }
        self.verify_node_count(bert_model, expected_node_count)
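These tests rely on a verify_node_count helper defined on the test class but not shown on this page. A plausible sketch, assuming the optimized model object exposes a get_nodes_by_op_type lookup (an assumption, not the confirmed API):

    def verify_node_count(self, bert_model, expected_node_count):
        # Compare the node count per op type in the optimized graph
        # against the expected fusion results.
        for op_type, expected in expected_node_count.items():
            nodes = bert_model.get_nodes_by_op_type(op_type)  # assumed helper
            self.assertEqual(
                len(nodes), expected,
                f"{op_type}: expected {expected}, got {len(nodes)}")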
Example #3
    def test_tensorflow_model_1_cpu(self):
        input = self.get_model("tensorflow", 1)

        # The model needs constant folding. Use onnxruntime to do it for now.
        temp = 'temp.onnx'
        run_onnxruntime(input, use_gpu=False, optimized_model_path=temp)

        bert_model = optimize_model(temp,
                                    framework='tensorflow',
                                    gpu_only=False,
                                    num_heads=2,
                                    hidden_size=8,
                                    sequence_length=7,
                                    input_int32=False,
                                    float16=False,
                                    verbose=False)
        os.remove(temp)

        # Optimization for TensorFlow models is still ongoing.
        # TODO: update this after the code is complete.
        expected_node_count = {
            'EmbedLayerNormalization': 0,
            'Attention': 0,
            'LayerNormalization': 0,
            'SkipLayerNormalization': 25,
            'BiasGelu': 0,
            'Gelu': 12,
            'FastGelu': 0
        }
        self.verify_node_count(bert_model, expected_node_count)
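run_onnxruntime is defined elsewhere; given the comment about constant folding, a minimal sketch of what it likely does, using onnxruntime's optimized_model_filepath option to write the graph-optimized (constant-folded) model to disk:

def run_onnxruntime(onnx_model_path, use_gpu, optimized_model_path):
    # Creating a session with optimized_model_filepath set makes ORT
    # serialize the graph-optimized model (constant folding included).
    sess_options = onnxruntime.SessionOptions()
    sess_options.graph_optimization_level = \
        onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
    sess_options.optimized_model_filepath = optimized_model_path
    providers = ['CUDAExecutionProvider'] if use_gpu else ['CPUExecutionProvider']
    onnxruntime.InferenceSession(onnx_model_path, sess_options,
                                 providers=providers)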
Example #4
    def test_keras_squad_model(self):
        input = BERT_TEST_MODELS['bert_keras_squad']

        bert_model = optimize_model(input,
                                    'bert_keras',
                                    num_heads=2,
                                    hidden_size=8)

        self.assertTrue(bert_model.is_fully_optimized())
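is_fully_optimized belongs to the optimizer's model class. A rough sketch of the kind of check it performs, inferred from the fused-operator statistics used in the other examples (an assumption, not the actual implementation):

    def is_fully_optimized(self):
        # Assumption: fully optimized means the embedding layer was fused once
        # and every transformer layer got matching Attention/Gelu/LayerNorm fusions.
        op_count = self.get_fused_operator_statistics()
        embed = op_count['EmbedLayerNormalization']
        attention = op_count['Attention']
        gelu = op_count['Gelu'] + op_count['BiasGelu'] + op_count['FastGelu']
        layer_norm = (op_count['LayerNormalization'] +
                      op_count['SkipLayerNormalization'])
        return (embed == 1 and attention > 0 and attention == gelu
                and layer_norm >= 2 * attention)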
Example #5
    def test_pytorch_model_2_cpu(self):
        input = BERT_TEST_MODELS['bert_squad_pytorch1.4_opset10_fp32']
        bert_model = optimize_model(input,
                                    'bert',
                                    gpu_only=False,
                                    num_heads=2,
                                    hidden_size=8,
                                    sequence_length=10,
                                    input_int32=False,
                                    float16=False)
        self.assertTrue(bert_model.is_fully_optimized())
Example #6
    def test_keras_squad_model_cpu(self):
        input = BERT_TEST_MODELS['bert_keras_squad']

        bert_model = optimize_model(input,
                                    'bert_keras',
                                    gpu_only=False,
                                    num_heads=2,
                                    hidden_size=8,
                                    sequence_length=7,
                                    input_int32=False,
                                    float16=False)

        self.assertTrue(bert_model.is_fully_optimized())
Example #7
    def test_pytorch_model_0(self):
        input = BERT_TEST_MODELS['bert_pytorch_0']
        bert_model = optimize_model(input, 'bert', num_heads=2, hidden_size=8)

        expected_node_count = {
            'EmbedLayerNormalization': 1,
            'Attention': 12,
            'SkipLayerNormalization': 24,
            'Gelu': 0,
            'FastGelu': 0,
            'BiasGelu': 12
        }
        self.verify_node_count(bert_model, expected_node_count)
Example #8
    def test_gpt2(self):
        input = BERT_TEST_MODELS['gpt2']
        bert_model = optimize_model(input, 'gpt2', num_heads=2, hidden_size=4)

        expected_node_count = {
            'EmbedLayerNormalization': 0,
            'Attention': 12,
            'Gelu': 0,
            'FastGelu': 12,
            'BiasGelu': 0,
            'LayerNormalization': 25,
            'SkipLayerNormalization': 0
        }
        self.verify_node_count(bert_model, expected_node_count)
Example #9
    def test_pytorch_model_0_cpu(self):
        input = BERT_TEST_MODELS['bert_pytorch_0']
        bert_model = optimize_model(input,
                                    'bert',
                                    gpu_only=False,
                                    num_heads=2,
                                    hidden_size=8,
                                    sequence_length=10,
                                    input_int32=False,
                                    float16=False)

        expected_node_count = {
            'EmbedLayerNormalization': 1,
            'Attention': 12,
            'SkipLayerNormalization': 24,
            'Gelu': 0,
            'FastGelu': 0,
            'BiasGelu': 12
        }
        self.verify_node_count(bert_model, expected_node_count)
Example #10
    def test_gpt2(self):
        input = BERT_TEST_MODELS['gpt2']
        bert_model = optimize_model(input,
                                    'gpt2',
                                    gpu_only=False,
                                    num_heads=2,
                                    hidden_size=4,
                                    sequence_length=2,
                                    input_int32=False,
                                    float16=False)

        expected_node_count = {
            'EmbedLayerNormalization': 0,
            'Attention': 12,
            'Gelu': 0,
            'FastGelu': 12,
            'BiasGelu': 0,
            'LayerNormalization': 25,
            'SkipLayerNormalization': 0
        }
        self.verify_node_count(bert_model, expected_node_count)
Example #11
    def test_pytorch_model_0_cpu(self):
        input = self.get_model("pytorch", 0)
        bert_model = optimize_model(input,
                                    framework='pytorch',
                                    gpu_only=False,
                                    num_heads=2,
                                    hidden_size=8,
                                    sequence_length=10,
                                    input_int32=False,
                                    float16=False,
                                    verbose=False)

        expected_node_count = {
            'EmbedLayerNormalization': 1,
            'Attention': 12,
            'SkipLayerNormalization': 24,
            'Gelu': 12,
            'FastGelu': 0,
            'BiasGelu': 0
        }
        self.verify_node_count(bert_model, expected_node_count)
Example #12
    def test_pytorch_model_2(self):
        input = BERT_TEST_MODELS['bert_squad_pytorch1.4_opset10_fp32']
        bert_model = optimize_model(input, 'bert', num_heads=2, hidden_size=8)
        self.assertTrue(bert_model.is_fully_optimized())
Example #13
def main():
    args = parse_arguments()
    setup_logger(args.verbose)
    dump_environment()

    enable_past_input = args.enable_past_input

    cache_dir = args.cache_dir
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    output_dir = args.output_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    (model_class, tokenizer_class,
     model_name_or_path) = MODEL_CLASSES[args.model_type]

    tokenizer = tokenizer_class.from_pretrained(model_name_or_path,
                                                cache_dir=cache_dir)
    model = model_class.from_pretrained(model_name_or_path,
                                        cache_dir=cache_dir)
    model.eval().cpu()

    inputs = tokenizer.encode_plus("Here is an example input for GPT2 model",
                                   add_special_tokens=True,
                                   return_tensors='pt')
    input_ids = inputs['input_ids']
    outputs = model(input_ids=input_ids, past=None)

    num_layer = model.config.n_layer
    present_names = [f'present_{i}' for i in range(num_layer)]
    output_names = ["last_state"] + present_names

    input_names = ['input_ids']
    dynamic_axes = {
        'input_ids': {
            0: 'batch_size',
            1: 'seq_len'
        },
        'last_state': {
            0: 'batch_size',
            1: 'seq_len'
        }
    }
    for name in present_names:
        dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'}

    if enable_past_input:
        past_names = [f'past_{i}' for i in range(num_layer)]
        input_names = ['input_ids'] + past_names
        dummy_past = [
            torch.zeros(list(outputs[1][0].shape)) for _ in range(num_layer)
        ]
        for name in past_names:
            dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'}
        export_inputs = (inputs['input_ids'], tuple(dummy_past))
    else:
        # A trailing comma is needed to make this a one-element tuple;
        # (inputs['input_ids']) alone is just the tensor.
        export_inputs = (inputs['input_ids'], )

    export_model_path = os.path.join(
        output_dir, 'gpt2_past{}.onnx'.format(int(enable_past_input)))

    torch.onnx.export(model,
                      args=export_inputs,
                      f=export_model_path,
                      input_names=input_names,
                      output_names=output_names,
                      dynamic_axes=dynamic_axes,
                      opset_version=11,
                      do_constant_folding=True,
                      verbose=False)

    # Run the PyTorch performance test before updating environment variables.
    past = dummy_past if enable_past_input else None
    outputs = pytorch_inference(model,
                                input_ids,
                                past,
                                total_runs=args.total_runs)

    # Set up environment variables before importing onnxruntime.
    setup_environment(args.use_openmp)
    import onnxruntime

    onnx_model_path = export_model_path if enable_past_input else remove_past_outputs(
        export_model_path)

    if args.enable_optimization:
        from bert_model_optimization import optimize_model
        m = optimize_model(onnx_model_path,
                           model_type='gpt2',
                           gpu_only=False,
                           num_heads=12,
                           hidden_size=768,
                           sequence_length=64,
                           input_int32=False,
                           float16=False,
                           opt_level=0)
        onnx_model_path = os.path.join(
            output_dir,
            'gpt2_past{}_optimized.onnx'.format(int(enable_past_input)))
        m.save_model_to_file(onnx_model_path)

    if 'CUDAExecutionProvider' in onnxruntime.get_available_providers():
        logger.warning(
            "onnxruntime-gpu is not built with OpenMP. You might try the onnxruntime package for CPU inference instead."
        )

    sess_options = onnxruntime.SessionOptions()

    if args.use_openmp:
        sess_options.intra_op_num_threads = 1
    else:
        sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
    logger.info(
        f"session option: intra_op_num_threads={sess_options.intra_op_num_threads}"
    )

    logger.info(f"Start inferencing onnx model: {onnx_model_path}")
    session = onnxruntime.InferenceSession(onnx_model_path,
                                           sess_options,
                                           providers=['CPUExecutionProvider'])

    ort_outputs = onnxruntime_inference(session, input_ids, past,
                                        args.total_runs)
    if args.verify_outputs:
        # The numpy.allclose result must be part of the message; passed as a
        # second positional argument to logger.info it is never rendered.
        is_close = numpy.allclose(ort_outputs[0],
                                  outputs[0].cpu(),
                                  rtol=1e-05,
                                  atol=1e-04)
        logger.info(
            f'PyTorch and OnnxRuntime output 0 (last_state) are close: {is_close}'
        )

        for layer in range(model.config.n_layer):
            is_close = numpy.allclose(ort_outputs[1 + layer],
                                      outputs[1][layer].cpu(),
                                      rtol=1e-05,
                                      atol=1e-04)
            logger.info(
                f'PyTorch and OnnxRuntime layer {layer} state (present_{layer}) are close: {is_close}'
            )
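pytorch_inference and onnxruntime_inference are helpers defined elsewhere in the script. A minimal sketch of the ONNX Runtime side, assuming past tensors are fed as past_0 ... past_{n-1} and latency is averaged over total_runs (input names follow the export code above; logger is the script's module-level logger):

import timeit

def onnxruntime_inference(session, input_ids, past=None, total_runs=100):
    # Feed input_ids (and the past_i tensors, if any) as numpy arrays.
    ort_inputs = {'input_ids': input_ids.cpu().numpy()}
    if past is not None:
        for i, past_i in enumerate(past):
            ort_inputs[f'past_{i}'] = past_i.cpu().numpy()
    ort_outputs = session.run(None, ort_inputs)

    # Average latency over total_runs repetitions.
    latency = timeit.timeit(lambda: session.run(None, ort_inputs),
                            number=total_runs) / total_runs
    logger.info(f"OnnxRuntime inference time: {latency * 1000:.2f} ms")
    return ort_outputs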