Example #1
    def _load_onnx(self, model_name):
        input_names = MODELS[model_name][0]
        base_path = "../onnx_models/"

        config_modifier = ConfigModifier(None)
        fusion_options = None
        model_class = "AutoModel"
        with torch.no_grad():
            export_onnx_model_from_pt(
                model_name,
                MODELS[model_name][1],  # opset version
                MODELS[model_name][2],  # use_external_data_format
                MODELS[model_name][3],  # model type
                model_class,
                config_modifier,
                "../cache_models",  # cache_dir
                base_path,  # onnx_dir
                input_names[:1],
                False,  # use_gpu
                Precision.FLOAT32,
                OptimizerInfo.BYSCRIPT,
                True,  # validate_onnx
                True,  # use_raw_attention_mask
                False,  # overwrite
                {},  # model_fusion_statistics
                fusion_options,
            )
        model_path = base_path + model_name.replace("-", "_") + "_1.onnx"
        return onnx.load_model(model_path)
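
A quick sanity check on what this helper produces: the export writes a single-input ONNX file under base_path, with hyphens in the model name replaced by underscores. The sketch below is illustrative only; the model name and path are hypothetical examples, and it assumes the exported file already exists on disk.

import onnx

# Hypothetical output path for the model name "bert-base-cased":
# base_path + model name with '-' replaced by '_' + "_1.onnx" (one input).
model_path = "../onnx_models/bert_base_cased_1.onnx"
model = onnx.load_model(model_path)
onnx.checker.check_model(model)  # structural validation of the exported graph
print(len(model.graph.node), "nodes in the exported graph")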
Example #2
    def _load_onnx(self, model_name):
        input_names = MODELS[model_name][0]
        base_path = "../onnx_models/"
        import torch

        with torch.no_grad():
            export_onnx_model_from_pt(
                model_name,
                MODELS[model_name][1],
                MODELS[model_name][2],
                MODELS[model_name][3],
                None,
                "../cache_models",
                base_path,
                input_names[:1],
                False,
                Precision.FLOAT32,
                OptimizerInfo.BYSCRIPT,
                True,
                True,
                False,
                {},
            )
        model_path = base_path + model_name.replace("-", "_") + "_1.onnx"
        import onnx

        return onnx.load_model(model_path)
Example #3
    def _load_onnx(self, model_name):
        input_names = MODELS[model_name][0]
        base_path = "../onnx_models/"
        import torch
        with torch.no_grad():
            export_onnx_model_from_pt(
                model_name, MODELS[model_name][1], MODELS[model_name][2],
                MODELS[model_name][3], None, '../cache_models', base_path,
                input_names[:1], False, Precision.FLOAT32, True, True, True,
                False, {})
        model_path = base_path + model_name.replace('-', '_') + "_1.onnx"
        import onnx
        return onnx.load_model(model_path)
Example #4
    def _test_optimizer_on_huggingface_model(self,
                                             model_name,
                                             expected_fusion_result_list,
                                             inputs_count=1,
                                             validate_model=True):
        # Remove cached model so that CI machine will have space
        import shutil
        shutil.rmtree('./cache_models', ignore_errors=True)
        shutil.rmtree('./onnx_models', ignore_errors=True)
        # expect the fusion result list to have the following keys:
        # EmbedLayerNormalization, Attention, Gelu, FastGelu, BiasGelu, LayerNormalization, SkipLayerNormalization
        model_fusion_statistics = {}

        input_names = MODELS[model_name][0]

        import torch
        with torch.no_grad():
            _, is_valid_onnx_model, _, _ = export_onnx_model_from_pt(model_name, MODELS[model_name][1],
                                                                     MODELS[model_name][2], MODELS[model_name][3], None,
                                                                     './cache_models', './onnx_models',
                                                                     input_names[:inputs_count], False,
                                                                     Precision.FLOAT32, True, True, True, True,
                                                                     model_fusion_statistics)

        onnx_model = list(model_fusion_statistics.keys())[0]
        fusion_result_list = list(model_fusion_statistics[onnx_model].values())

        if validate_model:
            self.assertEqual(is_valid_onnx_model, True)
        self.assertEqual(fusion_result_list, expected_fusion_result_list)
Example #5
    def _test_optimizer_on_huggingface_model(self,
                                             model_name,
                                             expected_fusion_result_list,
                                             inputs_count=1,
                                             validate_model=True):
        # expect the fusion result list to have the following keys:
        # EmbedLayerNormalization, Attention, Gelu, FastGelu, BiasGelu, LayerNormalization, SkipLayerNormalization
        model_fusion_statistics = {}
        from onnx_exporter import export_onnx_model_from_pt
        from huggingface_models import MODELS
        from benchmark_helper import Precision

        input_names = MODELS[model_name][0]

        import torch
        with torch.no_grad():
            _, is_valid_onnx_model, _, _ = export_onnx_model_from_pt(
                model_name, MODELS[model_name][1], MODELS[model_name][2],
                MODELS[model_name][3], None, './cache_models', './onnx_models',
                input_names[:inputs_count], False, Precision.FLOAT32, True,
                True, True, True, model_fusion_statistics)

        onnx_model = list(model_fusion_statistics.keys())[0]
        fusion_result_list = list(model_fusion_statistics[onnx_model].values())

        if validate_model:
            self.assertEqual(is_valid_onnx_model, True)
        self.assertEqual(fusion_result_list, expected_fusion_result_list)
Example #6
    def _test_optimizer_on_huggingface_model(
        self,
        model_name,
        expected_fusion_result_list,
        inputs_count=1,
        validate_model=True,
    ):
        # Remove cached model so that CI machine will have space
        shutil.rmtree("./cache_models", ignore_errors=True)
        shutil.rmtree("./onnx_models", ignore_errors=True)
        # expect the fusion result list to have the following keys:
        # EmbedLayerNormalization, Attention, Gelu, FastGelu, BiasGelu, LayerNormalization, SkipLayerNormalization
        model_fusion_statistics = {}

        input_names = MODELS[model_name][0]

        config_modifier = ConfigModifier(None)
        fusion_options = None
        model_class = "AutoModel"
        with torch.no_grad():
            _, is_valid_onnx_model, _, _ = export_onnx_model_from_pt(
                model_name,
                MODELS[model_name][1],  # opset version
                MODELS[model_name][2],  # use_external_data_format
                MODELS[model_name][3],  # optimization model type
                model_class,
                config_modifier,
                "./cache_models",
                "./onnx_models",
                input_names[:inputs_count],
                False,
                Precision.FLOAT32,
                OptimizerInfo.BYSCRIPT,
                True,
                True,
                True,
                model_fusion_statistics,
                fusion_options,
            )

        onnx_model = list(model_fusion_statistics.keys())[0]
        fusion_result_list = list(model_fusion_statistics[onnx_model].values())

        if validate_model:
            self.assertEqual(is_valid_onnx_model, True)
        self.assertEqual(fusion_result_list, expected_fusion_result_list)
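
For context, a concrete test method would typically call this helper with a model name and the expected per-operator fusion counts, in the order given by the comment above. The sketch below is illustrative only: the model name is an example and the counts are placeholders, not measured results.

    def test_bert_fusion(self):
        # Placeholder counts, ordered as: EmbedLayerNormalization, Attention,
        # Gelu, FastGelu, BiasGelu, LayerNormalization, SkipLayerNormalization.
        self._test_optimizer_on_huggingface_model(
            "bert-base-cased",
            [1, 12, 0, 0, 12, 0, 24],  # illustrative values only
        )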
Example #7
def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads,
                    batch_sizes, sequence_lengths, repeat_times, input_counts,
                    optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose,
                    overwrite, disable_ort_io_binding, use_raw_attention_mask,
                    model_fusion_statistics, model_source):
    import onnxruntime

    results = []
    if use_gpu and ('CUDAExecutionProvider'
                    not in onnxruntime.get_available_providers()):
        logger.error(
            "Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )
        return results

    if (not use_gpu) and ('CUDAExecutionProvider'
                          in onnxruntime.get_available_providers()):
        logger.warning(
            "Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance."
        )

    for model_name in model_names:
        all_input_names = MODELS[model_name][0]
        for num_inputs in input_counts:
            if num_inputs > len(all_input_names):
                break

            input_names = all_input_names[:num_inputs]

            if 'pt' in model_source:
                with torch.no_grad():
                    onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_pt(
                        model_name, MODELS[model_name][1],
                        MODELS[model_name][2], MODELS[model_name][3],
                        model_class, cache_dir, onnx_dir, input_names, use_gpu,
                        precision, optimize_onnx, validate_onnx,
                        use_raw_attention_mask, overwrite,
                        model_fusion_statistics)
            if 'tf' in model_source:
                onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_tf(
                    model_name, MODELS[model_name][1], MODELS[model_name][2],
                    MODELS[model_name][3], model_class, cache_dir, onnx_dir,
                    input_names, use_gpu, precision, optimize_onnx,
                    validate_onnx, use_raw_attention_mask, overwrite,
                    model_fusion_statistics)

            if not is_valid_onnx_model:
                continue

            ort_session = create_onnxruntime_session(
                onnx_model_file,
                use_gpu,
                enable_all_optimization=True,
                num_threads=num_threads,
                verbose=verbose)
            if ort_session is None:
                continue

            ort_output_names = [
                node_arg.name for node_arg in ort_session.get_outputs()
            ]
            output_buffers = {"last_state": None, "pooler": None}
            device = "cuda" if use_gpu else "cpu"
            config = AutoConfig.from_pretrained(model_name,
                                                cache_dir=cache_dir)
            max_last_state_size = numpy.prod([
                max(batch_sizes),
                max(sequence_lengths),
                max(vocab_size, config.hidden_size)
            ])
            max_pooler_size = numpy.prod(
                [max(batch_sizes), config.hidden_size])
            for batch_size in batch_sizes:
                if batch_size <= 0:
                    continue
                for sequence_length in sequence_lengths:
                    if max_sequence_length is not None and sequence_length > max_sequence_length:
                        continue

                    input_value_type = numpy.int64 if 'pt' in model_source else numpy.int32
                    ort_inputs = create_onnxruntime_input(
                        vocab_size, batch_size, sequence_length, input_names,
                        input_value_type)

                    result_template = {
                        "engine": "onnxruntime",
                        "version": onnxruntime.__version__,
                        "device": device,
                        "optimizer": optimize_onnx,
                        "precision": precision,
                        "io_binding": not disable_ort_io_binding,
                        "model_name": model_name,
                        "inputs": num_inputs,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "datetime": str(datetime.now()),
                    }

                    logger.info(
                        "Run onnxruntime on {} with input shape {}".format(
                            model_name, [batch_size, sequence_length]))
                    if disable_ort_io_binding:
                        result = inference_ort(ort_session, ort_inputs,
                                               result_template, repeat_times,
                                               batch_size)
                    else:
                        # Get output sizes from a dummy ort run
                        ort_outputs = ort_session.run(ort_output_names,
                                                      ort_inputs)

                        data_type = numpy.longlong if 'pt' in model_source else numpy.int32
                        result = inference_ort_with_io_binding(
                            ort_session, ort_inputs, result_template,
                            repeat_times, ort_output_names, ort_outputs,
                            output_buffers, max_last_state_size,
                            max_pooler_size, batch_size, device, data_type)
                    logger.info(result)
                    results.append(result)

    return results
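
A minimal driver for this signature might look like the sketch below. All values are placeholders for a small CPU run, the keyword names follow the parameter list shown above, and MODELS / Precision are assumed to come from the benchmark helper modules (see the imports in Example #5).

model_fusion_statistics = {}
results = run_onnxruntime(
    use_gpu=False,
    model_names=["bert-base-cased"],  # any key defined in MODELS
    model_class="AutoModel",
    precision=Precision.FLOAT32,
    num_threads=4,
    batch_sizes=[1],
    sequence_lengths=[128],
    repeat_times=100,
    input_counts=[1],
    optimize_onnx=True,
    validate_onnx=True,
    cache_dir="./cache_models",
    onnx_dir="./onnx_models",
    verbose=False,
    overwrite=False,
    disable_ort_io_binding=False,
    use_raw_attention_mask=True,
    model_fusion_statistics=model_fusion_statistics,
    model_source="pt",
)
# Each entry in results is a dict built from result_template above.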
Example #8
def run_onnxruntime(
    use_gpu,
    provider,
    model_names,
    model_class,
    config_modifier,
    precision,
    num_threads,
    batch_sizes,
    sequence_lengths,
    repeat_times,
    input_counts,
    optimizer_info,
    validate_onnx,
    cache_dir,
    onnx_dir,
    verbose,
    overwrite,
    disable_ort_io_binding,
    use_raw_attention_mask,
    model_fusion_statistics,
    model_source,
    args,
):
    import onnxruntime

    results = []
    if (use_gpu and
        ("CUDAExecutionProvider" not in onnxruntime.get_available_providers())
            and ("ROCMExecutionProvider"
                 not in onnxruntime.get_available_providers())):
        logger.error(
            "Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )
        return results

    warm_up_repeat = 0
    if provider == "tensorrt":
        optimizer_info = OptimizerInfo.NOOPT
        warm_up_repeat = 5
        if "TensorrtExecutionProvider" not in onnxruntime.get_available_providers(
        ):
            logger.error(
                "Please install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance."
            )
            return results

    if optimizer_info == OptimizerInfo.NOOPT:
        logger.warning(
            f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied."
        )

    for model_name in model_names:
        all_input_names = MODELS[model_name][0]
        for num_inputs in input_counts:
            if num_inputs > len(all_input_names):
                break

            input_names = all_input_names[:num_inputs]
            args.model_type = MODELS[model_name][3]
            fusion_options = FusionOptions.parse(args)

            if "pt" in model_source:
                with torch.no_grad():
                    (
                        onnx_model_file,
                        is_valid_onnx_model,
                        vocab_size,
                        max_sequence_length,
                    ) = export_onnx_model_from_pt(
                        model_name,
                        MODELS[model_name][1],
                        MODELS[model_name][2],
                        MODELS[model_name][3],
                        model_class,
                        config_modifier,
                        cache_dir,
                        onnx_dir,
                        input_names,
                        use_gpu,
                        precision,
                        optimizer_info,
                        validate_onnx,
                        use_raw_attention_mask,
                        overwrite,
                        model_fusion_statistics,
                        fusion_options,
                    )
            if "tf" in model_source:
                (
                    onnx_model_file,
                    is_valid_onnx_model,
                    vocab_size,
                    max_sequence_length,
                ) = export_onnx_model_from_tf(
                    model_name,
                    MODELS[model_name][1],
                    MODELS[model_name][2],
                    MODELS[model_name][3],
                    model_class,
                    config_modifier,
                    cache_dir,
                    onnx_dir,
                    input_names,
                    use_gpu,
                    precision,
                    optimizer_info,
                    validate_onnx,
                    use_raw_attention_mask,
                    overwrite,
                    model_fusion_statistics,
                    fusion_options,
                )

            if not is_valid_onnx_model:
                continue

            ort_session = create_onnxruntime_session(
                onnx_model_file,
                use_gpu,
                provider,
                enable_all_optimization=True,
                num_threads=num_threads,
                verbose=verbose,
            )
            if ort_session is None:
                continue

            ort_output_names = [
                node_arg.name for node_arg in ort_session.get_outputs()
            ]
            output_buffers = []
            device = "cuda" if use_gpu else "cpu"
            config = AutoConfig.from_pretrained(model_name,
                                                cache_dir=cache_dir)
            max_last_state_size = numpy.prod([
                max(batch_sizes),
                max(sequence_lengths),
                max(vocab_size, config.hidden_size),
            ])
            max_pooler_size = numpy.prod(
                [max(batch_sizes), config.hidden_size])
            for batch_size in batch_sizes:
                if batch_size <= 0:
                    continue
                for sequence_length in sequence_lengths:
                    if max_sequence_length is not None and sequence_length > max_sequence_length:
                        continue

                    input_value_type = numpy.int64 if "pt" in model_source else numpy.int32
                    ort_inputs = create_onnxruntime_input(
                        vocab_size,
                        batch_size,
                        sequence_length,
                        input_names,
                        config,
                        input_value_type,
                    )
                    result_template = {
                        "engine": "onnxruntime",
                        "version": onnxruntime.__version__,
                        "providers": provider,
                        "device": device,
                        "optimizer": optimizer_info,
                        "precision": precision,
                        "io_binding": not disable_ort_io_binding,
                        "model_name": model_name,
                        "inputs": num_inputs,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }

                    logger.info(
                        "Run onnxruntime on {} with input shape {}".format(
                            model_name, [batch_size, sequence_length]))

                    if disable_ort_io_binding:
                        result = inference_ort(
                            ort_session,
                            ort_inputs,
                            result_template,
                            repeat_times,
                            batch_size,
                            warm_up_repeat,
                        )
                    else:
                        # Get output sizes from a dummy ort run
                        ort_outputs = ort_session.run(ort_output_names,
                                                      ort_inputs)
                        output_buffer_max_sizes = [max_last_state_size]
                        for i in range(len(ort_outputs)):
                            if i == 2 and MODELS[model_name][3] == "gpt":
                                # past state output max size
                                output_buffer_max_sizes.append(max_pooler_size)
                            else:
                                output_buffer_max_sizes.append(
                                    max_last_state_size)

                        data_type = numpy.longlong if "pt" in model_source else numpy.intc
                        result = inference_ort_with_io_binding(
                            ort_session,
                            ort_inputs,
                            result_template,
                            repeat_times,
                            ort_output_names,
                            ort_outputs,
                            output_buffers,
                            output_buffer_max_sizes,
                            batch_size,
                            device,
                            data_type,
                            warm_up_repeat,
                        )
                    logger.info(result)
                    results.append(result)

    return results