Example #1
def test_ort(args, device):
    model_name = args.model

    onnx_model_path = find_onnx_model(model_name) if not args.onnx else args.onnx

    optimized = onnx_model_path.endswith("_fp16.onnx") or onnx_model_path.endswith("_fp32.onnx")
    precision = 'fp32' if not onnx_model_path.endswith("_fp16.onnx") else 'fp16'

    model = load_torch_model(model_name, device)

    num_threads = args.num_threads

    session = benchmark_helper.create_onnxruntime_session(onnx_model_path,
                                                          use_gpu=True,
                                                          enable_all_optimization=True,
                                                          num_threads=num_threads)
    if session is None:
        raise RuntimeError(f"Failed to create ORT session from ONNX file {onnx_model_path}")

    description = onnx_model_path
    if os.environ.get("ORT_LONGFORMER_COMPACT_MEMORY", "0") == "1":
        description += "[compact_memory]"

    return test_ort_latency(device, model, model_name, description, session, args.batch_sizes, args.sequence_lengths,
                            args.global_lengths, args.test_times, num_threads, optimized, precision, args.validate_onnx,
                            args.disable_io_binding, args.verbose)
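
A minimal usage sketch for the helper above, assuming an argparse-style namespace; the field names are inferred from the attributes test_ort reads, and the model name and values are placeholders rather than values from the source.

# Hedged usage sketch: the namespace fields below are assumptions inferred from what
# test_ort reads; real argument parsing lives in the benchmark script itself.
import os
from types import SimpleNamespace

os.environ["ORT_LONGFORMER_COMPACT_MEMORY"] = "1"  # tag runs that use compact memory

args = SimpleNamespace(
    model="longformer-base-4096",  # assumed model name
    onnx=None,                     # fall back to find_onnx_model(model_name)
    num_threads=0,
    batch_sizes=[1],
    sequence_lengths=[512],
    global_lengths=[8],
    test_times=100,
    validate_onnx=False,
    disable_io_binding=False,
    verbose=False,
)
# results = test_ort(args, device=torch.device("cuda:0"))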
Example #2
def run_profile(onnx_model_path,
                use_gpu,
                thread_num,
                batch_size,
                sequence_length,
                samples=1,
                input_ids_name=None,
                segment_ids_name=None,
                input_mask_name=None,
                dummy_inputs=None):
    from benchmark_helper import create_onnxruntime_session

    session = create_onnxruntime_session(onnx_model_path,
                                         use_gpu,
                                         num_threads=thread_num,
                                         enable_profiling=True)

    if dummy_inputs is None:
        all_inputs = create_inputs(onnx_model_path, batch_size,
                                   sequence_length, samples, input_ids_name,
                                   segment_ids_name, input_mask_name)
        for inputs in all_inputs:
            _ = session.run(None, inputs)
    else:
        for i in range(samples):
            _ = session.run(None, dummy_inputs)

    profile_file = session.end_profiling()
    return profile_file
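
A hedged sketch of calling run_profile; the model path and input names are placeholders, and the return value is the path to the profiling JSON written by session.end_profiling().

# Hedged usage sketch: the path and input names are placeholders, not values from the source.
profile_file = run_profile(
    "bert-base-cased.onnx",   # assumed ONNX model path
    use_gpu=False,
    thread_num=1,
    batch_size=1,
    sequence_length=128,
    samples=10,
    input_ids_name="input_ids",
    segment_ids_name="segment_ids",
    input_mask_name="input_mask",
)
print(f"ONNX Runtime profile written to {profile_file}")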
Example #3
def validate_onnx_model(onnx_model_path, example_inputs, example_outputs_flatten, use_gpu, fp16):
    test_session = create_onnxruntime_session(onnx_model_path, use_gpu, enable_all_optimization=False)
    if test_session is None:
        logger.error(f"{onnx_model_path} is an invalid ONNX model")
        return False

    logger.info(f"{onnx_model_path} is a valid ONNX model")

    # Compare the inference result with PyTorch or Tensorflow
    example_ort_inputs = {k: t.cpu().numpy() for k, t in example_inputs.items()}
    example_ort_outputs = test_session.run(None, example_ort_inputs)
    if len(example_outputs_flatten) != len(example_ort_outputs):
        logger.error(
            f"Number of output tensors expected {len(example_outputs_flatten)}, got {len(example_ort_outputs)}")
        return False

    for i in range(len(example_outputs_flatten)):
        abs_diff = numpy.amax(numpy.abs(example_ort_outputs[i] - example_outputs_flatten[i].cpu().numpy()))
        if abs_diff > 1e-4:
            logger.info(f"Max absolute diff={abs_diff} for output tensor {i}")

        rtol = 5e-02 if fp16 else 1e-4
        atol = 1e-01 if fp16 else 1e-4
        if not numpy.allclose(example_ort_outputs[i], example_outputs_flatten[i].cpu(), rtol=rtol, atol=atol):
            logger.error(f"Output tensor {i} is not close: rtol={rtol}, atol={atol}")
            return False

    logger.info(f"inference result of onnxruntime is validated on {onnx_model_path}")
    return True
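
A hedged sketch of driving validate_onnx_model from a PyTorch model; the tokenizer, torch_model, and ONNX path are placeholders, and the flattening of the output tuple is an assumption about how example_outputs_flatten is prepared upstream.

# Hedged usage sketch: tokenizer, torch_model, and the ONNX path are illustrative only.
import torch

example_inputs = tokenizer("Hello, world!", return_tensors="pt")   # dict of input tensors
with torch.no_grad():
    outputs = torch_model(**example_inputs, return_dict=False)     # tuple of output tensors
example_outputs_flatten = [t for t in outputs if isinstance(t, torch.Tensor)]

is_valid = validate_onnx_model(
    "model_fp32.onnx", example_inputs, example_outputs_flatten, use_gpu=False, fp16=False)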
Example #4
def inference(model_path, dummy_inputs, outputs_path, use_gpu):
    environ_reset()
    environ_setting_nodes()
    environ_setting_paths(outputs_path)
    session = create_onnxruntime_session(model_path,
                                         use_gpu,
                                         enable_all_optimization=False)
    Gpt2Helper.onnxruntime_inference(session, dummy_inputs)
Example #5
def test_ort(args, device) -> List[Dict[str, Any]]:
    model_name = args.model

    onnx_model_path = find_onnx_model(
        model_name) if not args.onnx else args.onnx

    optimized = onnx_model_path.endswith(
        "_fp16.onnx") or onnx_model_path.endswith("_fp32.onnx")
    precision = "fp32" if not onnx_model_path.endswith(
        "_fp16.onnx") else "fp16"

    model = load_torch_model(model_name, device)

    num_threads = args.num_threads

    cuda_provider_options = {"arena_extend_strategy": "kSameAsRequested"}
    provider_options = {"CUDAExecutionProvider": cuda_provider_options}
    session = benchmark_helper.create_onnxruntime_session(
        onnx_model_path,
        use_gpu=True,
        enable_all_optimization=True,
        num_threads=num_threads,
        provider_options=provider_options,
    )
    if session is None:
        raise RuntimeError(
            f"Failed to create ORT session from ONNX file {onnx_model_path}")

    use_compact_memory = os.environ.get("ORT_LONGFORMER_COMPACT_MEMORY",
                                        "1") == "1"
    description = onnx_model_path
    if not use_compact_memory:
        description += "[non_compact_memory]"

    if args.use_half4:
        description += "[half4]" if precision == "fp16" else "[float4]"
    else:
        description += "[half2]" if precision == "fp16" else "[float4]"

    return test_ort_latency(
        device,
        model,
        model_name,
        description,
        session,
        args.batch_sizes,
        args.sequence_lengths,
        args.global_lengths,
        args.test_times,
        num_threads,
        optimized,
        precision,
        args.disable_io_binding,
        args.verbose,
        use_compact_memory,
        args.use_half4,
        args.disable_parity,
    )
Example #6
    def inference():
        session = benchmark_helper.create_onnxruntime_session(onnx_model_path,
                                                              use_gpu=True,
                                                              enable_all_optimization=True,
                                                              num_threads=num_threads)

        dummy_inputs: LongformerInputs = LongformerHelper.get_dummy_inputs(batch_size, sequence_length, global_length,
                                                                           device)
        ort_inputs = dummy_inputs.get_ort_inputs()
        for _ in range(test_times):
            ort_outputs = session.run(None, ort_inputs)
Example #7
def test_all(args):
    # Currently, the longformer attention operator can only run on GPU (there is no CPU implementation yet).
    device = torch.device('cuda:0')

    results = []
    for model_name in args.models:
        # Here we run an example input
        from transformers import LongformerModel
        torch_model_name_or_dir = MODELS[model_name]
        model = LongformerModel.from_pretrained(
            torch_model_name_or_dir)  # pretrained model name or directory
        model.to(device)

        # Search onnx model in the following order: optimized fp16 model, optimized fp32 model, raw model
        optimized = False
        precision = 'fp32'
        onnx_model_path = model_name + ".onnx"
        optimized_fp32_model = model_name + "_fp32.onnx"
        optimized_fp16_model = model_name + "_fp16.onnx"
        import os.path
        if os.path.isfile(optimized_fp16_model):
            onnx_model_path = optimized_fp16_model
            optimized = True
            precision = 'fp16'
        elif os.path.isfile(optimized_fp32_model):
            onnx_model_path = optimized_fp32_model
            optimized = True

        for num_threads in args.num_threads:
            if "torch" in args.engines:
                results += test_torch(device, model, model_name,
                                      args.batch_sizes, args.sequence_lengths,
                                      args.global_lengths, args.test_times,
                                      num_threads)

            if "onnxruntime" in args.engines:
                session = benchmark_helper.create_onnxruntime_session(
                    onnx_model_path,
                    use_gpu=True,
                    enable_all_optimization=True,
                    num_threads=num_threads)
                results += test_onnxruntime(device, model, model_name, session,
                                            args.batch_sizes,
                                            args.sequence_lengths,
                                            args.global_lengths,
                                            args.test_times, num_threads,
                                            optimized, precision)
    return results
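
A hedged sketch of the argument namespace test_all expects; the field names are inferred from the attributes referenced above, and the values are placeholders.

# Hedged usage sketch: values are placeholders; real values come from argparse.
from types import SimpleNamespace

args = SimpleNamespace(
    models=["longformer-base-4096"],
    engines=["torch", "onnxruntime"],
    num_threads=[0],
    batch_sizes=[1],
    sequence_lengths=[512, 1024],
    global_lengths=[8],
    test_times=100,
)
# results = test_all(args)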
Example #8
def run_profile(onnx_model_path, use_gpu, basic_optimization, thread_num,
                batch_size, sequence_length, all_inputs):
    from benchmark_helper import create_onnxruntime_session

    session = create_onnxruntime_session(
        onnx_model_path,
        use_gpu,
        enable_all_optimization=not basic_optimization,
        num_threads=thread_num,
        enable_profiling=True)

    for inputs in all_inputs:
        _ = session.run(None, inputs)

    profile_file = session.end_profiling()
    return profile_file
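
A hedged sketch of building all_inputs for this profiling variant; the feed names and int64 dtype are assumptions about the model's input signature, not values from the source.

# Hedged usage sketch: input names and dtypes are assumptions about the model signature.
import numpy

batch_size, sequence_length = 1, 128
all_inputs = [{
    "input_ids": numpy.ones((batch_size, sequence_length), dtype=numpy.int64),
    "attention_mask": numpy.ones((batch_size, sequence_length), dtype=numpy.int64),
} for _ in range(10)]

profile_file = run_profile("model.onnx", use_gpu=False, basic_optimization=False,
                           thread_num=1, batch_size=batch_size,
                           sequence_length=sequence_length, all_inputs=all_inputs)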
Example #9
    def inference():
        # Update Arena strategy so that we can measure the minimum memory required
        cuda_provider_options = {"arena_extend_strategy": "kSameAsRequested"}
        provider_options = {"CUDAExecutionProvider": cuda_provider_options}
        session = benchmark_helper.create_onnxruntime_session(
            onnx_model_path,
            use_gpu=True,
            enable_all_optimization=True,
            num_threads=num_threads,
            provider_options=provider_options,
        )

        dummy_inputs: LongformerInputs = LongformerHelper.get_dummy_inputs(
            batch_size, sequence_length, global_length, device)
        ort_inputs = dummy_inputs.get_ort_inputs()
        for _ in range(test_times):
            _ = session.run(None, ort_inputs)
Example #10
def run_onnxruntime(
    use_gpu,
    provider,
    model_names,
    model_class,
    config_modifier,
    precision,
    num_threads,
    batch_sizes,
    sequence_lengths,
    repeat_times,
    input_counts,
    optimizer_info,
    validate_onnx,
    cache_dir,
    onnx_dir,
    verbose,
    overwrite,
    disable_ort_io_binding,
    use_raw_attention_mask,
    model_fusion_statistics,
    model_source,
    args,
):
    import onnxruntime

    results = []
    if (use_gpu and
        ("CUDAExecutionProvider" not in onnxruntime.get_available_providers())
            and ("ROCMExecutionProvider"
                 not in onnxruntime.get_available_providers())):
        logger.error(
            "Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )
        return results

    warm_up_repeat = 0
    if provider == "tensorrt":
        optimizer_info = OptimizerInfo.NOOPT
        warm_up_repeat = 5
        if "TensorrtExecutionProvider" not in onnxruntime.get_available_providers(
        ):
            logger.error(
                "Please install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance."
            )
            return results

    if optimizer_info == OptimizerInfo.NOOPT:
        logger.warning(
            f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied."
        )

    for model_name in model_names:
        all_input_names = MODELS[model_name][0]
        for num_inputs in input_counts:
            if num_inputs > len(all_input_names):
                break

            input_names = all_input_names[:num_inputs]
            args.model_type = MODELS[model_name][3]
            fusion_options = FusionOptions.parse(args)

            if "pt" in model_source:
                with torch.no_grad():
                    (
                        onnx_model_file,
                        is_valid_onnx_model,
                        vocab_size,
                        max_sequence_length,
                    ) = export_onnx_model_from_pt(
                        model_name,
                        MODELS[model_name][1],
                        MODELS[model_name][2],
                        MODELS[model_name][3],
                        model_class,
                        config_modifier,
                        cache_dir,
                        onnx_dir,
                        input_names,
                        use_gpu,
                        precision,
                        optimizer_info,
                        validate_onnx,
                        use_raw_attention_mask,
                        overwrite,
                        model_fusion_statistics,
                        fusion_options,
                    )
            if "tf" in model_source:
                (
                    onnx_model_file,
                    is_valid_onnx_model,
                    vocab_size,
                    max_sequence_length,
                ) = export_onnx_model_from_tf(
                    model_name,
                    MODELS[model_name][1],
                    MODELS[model_name][2],
                    MODELS[model_name][3],
                    model_class,
                    config_modifier,
                    cache_dir,
                    onnx_dir,
                    input_names,
                    use_gpu,
                    precision,
                    optimizer_info,
                    validate_onnx,
                    use_raw_attention_mask,
                    overwrite,
                    model_fusion_statistics,
                    fusion_options,
                )

            if not is_valid_onnx_model:
                continue

            ort_session = create_onnxruntime_session(
                onnx_model_file,
                use_gpu,
                provider,
                enable_all_optimization=True,
                num_threads=num_threads,
                verbose=verbose,
            )
            if ort_session is None:
                continue

            ort_output_names = [
                node_arg.name for node_arg in ort_session.get_outputs()
            ]
            output_buffers = []
            device = "cuda" if use_gpu else "cpu"
            config = AutoConfig.from_pretrained(model_name,
                                                cache_dir=cache_dir)
            max_last_state_size = numpy.prod([
                max(batch_sizes),
                max(sequence_lengths),
                max(vocab_size, config.hidden_size),
            ])
            max_pooler_size = numpy.prod(
                [max(batch_sizes), config.hidden_size])
            for batch_size in batch_sizes:
                if batch_size <= 0:
                    continue
                for sequence_length in sequence_lengths:
                    if max_sequence_length is not None and sequence_length > max_sequence_length:
                        continue

                    input_value_type = numpy.int64 if "pt" in model_source else numpy.int32
                    ort_inputs = create_onnxruntime_input(
                        vocab_size,
                        batch_size,
                        sequence_length,
                        input_names,
                        config,
                        input_value_type,
                    )
                    result_template = {
                        "engine": "onnxruntime",
                        "version": onnxruntime.__version__,
                        "providers": provider,
                        "device": device,
                        "optimizer": optimizer_info,
                        "precision": precision,
                        "io_binding": not disable_ort_io_binding,
                        "model_name": model_name,
                        "inputs": num_inputs,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }

                    logger.info(
                        "Run onnxruntime on {} with input shape {}".format(
                            model_name, [batch_size, sequence_length]))

                    if disable_ort_io_binding:
                        result = inference_ort(
                            ort_session,
                            ort_inputs,
                            result_template,
                            repeat_times,
                            batch_size,
                            warm_up_repeat,
                        )
                    else:
                        # Get output sizes from a dummy ort run
                        ort_outputs = ort_session.run(ort_output_names,
                                                      ort_inputs)
                        output_buffer_max_sizes = [max_last_state_size]
                        for i in range(len(ort_outputs)):
                            if i == 2 and MODELS[model_name][3] == "gpt":
                                # past state output max size
                                output_buffer_max_sizes.append(max_pooler_size)
                            else:
                                output_buffer_max_sizes.append(
                                    max_last_state_size)

                        data_type = numpy.longlong if "pt" in model_source else numpy.intc
                        result = inference_ort_with_io_binding(
                            ort_session,
                            ort_inputs,
                            result_template,
                            repeat_times,
                            ort_output_names,
                            ort_outputs,
                            output_buffers,
                            output_buffer_max_sizes,
                            batch_size,
                            device,
                            data_type,
                            warm_up_repeat,
                        )
                    logger.info(result)
                    results.append(result)

    return results
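
For context, a minimal sketch of what a latency-measuring helper such as inference_ort could look like under the call pattern above; this is a stand-in under stated assumptions, and the real helper in the benchmark utilities may differ in signature and reported fields.

# Hedged sketch only: a stand-in matching the positional call above, not the actual
# implementation from the benchmark utilities.
import timeit
from statistics import mean

def inference_ort_sketch(ort_session, ort_inputs, result_template, repeat_times,
                         batch_size, warm_up_repeat=0):
    for _ in range(warm_up_repeat):  # warm-up runs are excluded from timing
        ort_session.run(None, ort_inputs)
    latencies = timeit.repeat(lambda: ort_session.run(None, ort_inputs),
                              number=1, repeat=repeat_times)
    result = result_template.copy()
    result["average_latency_ms"] = mean(latencies) * 1000.0
    result["QPS"] = batch_size / mean(latencies)
    return result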
Example #11
def main(args):
    from transformers import __version__ as transformers_version

    if version.parse(transformers_version) < version.parse(
            "3.1.0"):  # past_key_values name does not exist in 3.0.2 or older
        raise RuntimeError("This tool requires transformers 3.1.0 or later.")

    logger.info(f"Arguments:{args}")
    if args.precision == Precision.FLOAT16:
        assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    torch.set_num_threads(
        psutil.cpu_count(
            logical=True) if args.thread_num <= 0 else args.thread_num)
    print(torch.__config__.parallel_info())

    cache_dir = args.cache_dir
    output_dir = args.onnx_dir
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    model_class = MODEL_CLASSES[args.model_class][0]
    if args.model_class == "GPT2LMHeadModel_BeamSearchStep":
        model_type = "beam_search_step"
    elif args.model_class == "GPT2LMHeadModel_ConfigurableOneStepSearch":
        model_type = "configurable_one_step_search"
    else:
        model_type = "default"

    gpt2helper = Gpt2HelperFactory.create_helper(model_type)
    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        torchscript=args.torchscript,
                                        cache_dir=cache_dir)
    if model_type == "beam_search_step":
        model = model_class.from_pretrained(
            args.model_name_or_path,
            config=config,
            batch_size=1,
            beam_size=args.beam_size,
            cache_dir=cache_dir,
        )
    elif model_type == "configurable_one_step_search":
        model = model_class.from_pretrained(
            args.model_name_or_path,
            config=config,
            batch_size=1,
            beam_size=args.beam_size,
            ignore_eos=args.ignore_eos,
            temperature=args.temperature,
            repetition_penalty=args.repetition_penalty,
            excluded_token_ids=args.excluded_token_ids,
            length_penalty=args.length_penalty,
            do_sample=args.do_sample,
            do_sample_top_p=args.do_sample_top_p,
            do_sample_top_k=args.do_sample_top_k,
            cache_dir=cache_dir,
        )
    else:
        model = model_class.from_pretrained(args.model_name_or_path,
                                            config=config,
                                            cache_dir=cache_dir)

    # This script does not support float16 for PyTorch.
    # if args.float16:
    #    model.half()

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.to(device)
    use_external_data_format = config.n_layer > 24  # TODO: find a way to check model size > 2GB
    onnx_model_paths = gpt2helper.get_onnx_paths(
        output_dir,
        args.model_name_or_path,
        args.model_class,
        has_past=True,
        new_folder=use_external_data_format,
    )

    onnx_model_path = onnx_model_paths["raw"]
    use_padding = MODEL_CLASSES[args.model_class][2]
    gpt2helper.export_onnx(
        model,
        device,
        onnx_model_path,
        args.verbose,
        use_external_data_format,
        has_position_ids=use_padding,
        has_attention_mask=use_padding,
    )

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        onnx_model_path = onnx_model_paths[str(
            args.precision) if args.precision != Precision.INT8 else "fp32"]
        gpt2helper.optimize_onnx(
            onnx_model_paths["raw"],
            onnx_model_path,
            args.precision == Precision.FLOAT16,
            model.config.num_attention_heads,
            model.config.hidden_size,
            use_external_data_format,
            auto_mixed_precision=True,
        )

        if args.precision == Precision.INT8:
            logger.info("quantizing model...")
            QuantizeHelper.quantize_onnx_model(onnx_model_path,
                                               onnx_model_paths["int8"],
                                               use_external_data_format)
            model = QuantizeHelper.quantize_torch_model(model)
            logger.info("finished quantizing model")
            onnx_model_path = onnx_model_paths["int8"]

    if args.torchscript:
        model = gpt2helper.torchscript(
            model,
            config,
            device,
            has_position_ids=use_padding,
            has_attention_mask=use_padding,
        )

    session = create_onnxruntime_session(
        onnx_model_path,
        args.use_gpu,
        enable_all_optimization=False,
        num_threads=args.thread_num,
        verbose=args.verbose,
    )
    if session is None:
        return

    # Allocate output buffers for IO Binding
    if model_type == "beam_search_step" or model_type == "configurable_one_step_search":
        max_output_shapes = gpt2helper.get_output_shapes(
            max(args.batch_sizes),
            context_len=max(args.past_sequence_lengths),
            past_sequence_length=max(args.past_sequence_lengths),
            sequence_length=max(args.sequence_lengths),
            beam_size=args.beam_size,
            step=0,
            config=config,
            model_class=args.model_class,
        )

        output_buffers = gpt2helper.get_output_buffers(
            max_output_shapes, device, args.precision == Precision.FLOAT16)

    else:
        max_output_shapes = gpt2helper.get_output_shapes(
            max(args.batch_sizes),
            max(args.past_sequence_lengths),
            max(args.sequence_lengths),
            config,
            args.model_class,
        )
        output_buffers = gpt2helper.get_output_buffers(
            max_output_shapes, device, args.precision == Precision.FLOAT16)

    csv_filename = args.result_csv or "benchmark_result_{}.csv".format(
        datetime.now().strftime("%Y%m%d-%H%M%S"))
    with open(csv_filename, mode="a", newline="") as csv_file:
        column_names = [
            "model_name",
            "model_class",
            "gpu",
            "precision",
            "optimizer",
            "torchscript",
            "batch_size",
            "sequence_length",
            "past_sequence_length",
            "torch_latency",
            "onnxruntime_latency",
            "onnxruntime_io_binding_latency",
        ]
        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        csv_writer.writeheader()

        for batch_size in args.batch_sizes:
            for sequence_length in args.sequence_lengths:
                for past_sequence_length in args.past_sequence_lengths:
                    assert batch_size > 0 and sequence_length > 0 and past_sequence_length >= 0
                    logger.debug(
                        f"Running test for batch_size={batch_size} sequence_length={sequence_length} past_sequence_length={past_sequence_length}..."
                    )
                    if model_type == "beam_search_step" or model_type == "configurable_one_step_search":
                        dummy_inputs = gpt2helper.get_dummy_inputs(
                            batch_size,
                            past_sequence_length,
                            sequence_length,
                            config.num_attention_heads,
                            config.hidden_size,
                            config.n_layer,
                            config.vocab_size,
                            device,
                            float16=(args.precision == Precision.FLOAT16),
                            has_position_ids=use_padding,
                            has_attention_mask=use_padding,
                        )
                        output_shapes = gpt2helper.get_output_shapes(
                            batch_size,
                            past_sequence_length,
                            past_sequence_length,
                            sequence_length,
                            args.beam_size,
                            0,
                            config,
                            args.model_class,
                        )
                    else:
                        dummy_inputs = gpt2helper.get_dummy_inputs(
                            batch_size,
                            past_sequence_length,
                            sequence_length,
                            config.num_attention_heads,
                            config.hidden_size,
                            config.n_layer,
                            config.vocab_size,
                            device,
                            float16=(args.precision == Precision.FLOAT16),
                            has_position_ids=use_padding,
                            has_attention_mask=use_padding,
                        )
                        output_shapes = gpt2helper.get_output_shapes(
                            batch_size,
                            past_sequence_length,
                            sequence_length,
                            config,
                            args.model_class,
                        )

                    try:
                        outputs, torch_latency = gpt2helper.pytorch_inference(
                            model, dummy_inputs, args.test_times)

                        # Dump Torch output shape
                        for i, value in enumerate(outputs):
                            if isinstance(value, tuple):
                                logger.debug(
                                    f"torch output {i} is tuple of size {len(value)}, shape {value[0].shape}"
                                )
                            else:
                                logger.debug(
                                    f"torch output {i} shape {value.shape}")

                        ort_outputs, ort_latency = gpt2helper.onnxruntime_inference(
                            session, dummy_inputs, args.test_times)

                        (
                            ort_io_outputs,
                            ort_io_latency,
                        ) = gpt2helper.onnxruntime_inference_with_binded_io(
                            session,
                            dummy_inputs,
                            output_buffers,
                            output_shapes,
                            args.test_times,
                            return_numpy=False,
                            include_copy_output_latency=args.
                            include_copy_output_latency,
                        )

                        if args.validate_onnx:
                            if gpt2helper.compare_outputs(
                                    outputs,
                                    ort_outputs,
                                    model_class=args.model_class,
                                    rtol=DEFAULT_TOLERANCE[args.precision],
                                    atol=DEFAULT_TOLERANCE[args.precision],
                            ):
                                logger.info(
                                    f"Pytorch and ONNX Runtime outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]})."
                                )

                            # Results of IO binding might be in GPU. Copy outputs to CPU for comparison.
                            copy_outputs = []
                            for output in ort_io_outputs:
                                copy_outputs.append(output.cpu().numpy())

                            if gpt2helper.compare_outputs(
                                    outputs,
                                    copy_outputs,
                                    model_class=args.model_class,
                                    rtol=DEFAULT_TOLERANCE[args.precision],
                                    atol=DEFAULT_TOLERANCE[args.precision],
                            ):
                                logger.info(
                                    f"Pytorch and ONNX Runtime IO Binding outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]})."
                                )

                        logger.info(
                            f"batch_size={batch_size}, sequence_length={sequence_length}, past_sequence_length={past_sequence_length}, torch_latency={torch_latency:.2f}, onnxruntime_latency={ort_latency:.2f}, onnxruntime_io_binding_latency={ort_io_latency:.2f}"
                        )

                        row = {
                            "model_name":
                            args.model_name_or_path,
                            "model_class":
                            args.model_class,
                            "gpu":
                            args.use_gpu,
                            "precision":
                            args.precision,
                            "optimizer":
                            args.optimize_onnx,
                            "torchscript":
                            args.torchscript,
                            "batch_size":
                            batch_size,
                            "sequence_length":
                            sequence_length,
                            "past_sequence_length":
                            past_sequence_length,
                            "torch_latency":
                            f"{torch_latency:.2f}",
                            "onnxruntime_latency":
                            f"{ort_latency:.2f}",
                            "onnxruntime_io_binding_latency":
                            f"{ort_io_latency:.2f}",
                        }
                        csv_writer.writerow(row)
                    except:
                        logger.error(f"Exception", exc_info=True)
                        return None

    logger.info(f"Results are saved to file {csv_filename}")
    return csv_filename
Example #12
def main():
    args = parse_arguments()
    setup_logger(args.verbose)

    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments:{args}")

    cache_dir = args.cache_dir
    output_dir = args.output if not args.output.endswith(
        ".onnx") else os.path.dirname(args.output)
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    if args.precision != Precision.FLOAT32:
        assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"

    if args.precision == Precision.FLOAT16:
        assert args.use_gpu, "fp16 requires --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    model_class = MODEL_CLASSES[args.model_class][0]
    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        cache_dir=cache_dir)
    if hasattr(config, 'return_tuple'):
        config.return_tuple = True
    model = model_class.from_pretrained(args.model_name_or_path,
                                        config=config,
                                        cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir,
                                                 args.model_name_or_path,
                                                 args.model_class)
    raw_onnx_model = args.output if args.output.endswith(
        '.onnx') else onnx_model_paths["raw"]
    output_path = raw_onnx_model if (
        args.output.endswith('.onnx') or
        (args.precision == Precision.FLOAT32 and not args.optimize_onnx)
    ) else onnx_model_paths[str(args.precision)]

    Gpt2Helper.export_onnx(model, device, raw_onnx_model, args.verbose)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        Gpt2Helper.optimize_onnx(raw_onnx_model, output_path,
                                 args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads,
                                 model.config.hidden_size)

    if args.precision == Precision.INT8:
        logger.info("quantizing model...")
        QuantizeHelper.quantize_onnx_model(output_path, output_path)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")

    session = create_onnxruntime_session(output_path,
                                         args.use_gpu,
                                         enable_all_optimization=False,
                                         verbose=args.verbose)
    if session is not None:
        Gpt2Helper.test_parity(session,
                               model,
                               device,
                               args.precision == Precision.FLOAT16,
                               rtol=args.tolerance,
                               atol=args.tolerance,
                               model_class=args.model_class)

    logger.info(f"Done. Output model: {output_path}")
Example #13
def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads,
                    batch_sizes, sequence_lengths, repeat_times, input_counts,
                    optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose,
                    overwrite, disable_ort_io_binding, use_raw_attention_mask,
                    model_fusion_statistics, model_source):
    import onnxruntime

    results = []
    if use_gpu and ('CUDAExecutionProvider'
                    not in onnxruntime.get_available_providers()):
        logger.error(
            "Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )
        return results

    if (not use_gpu) and ('CUDAExecutionProvider'
                          in onnxruntime.get_available_providers()):
        logger.warning(
            "Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance."
        )

    for model_name in model_names:
        all_input_names = MODELS[model_name][0]
        for num_inputs in input_counts:
            if num_inputs > len(all_input_names):
                break

            input_names = all_input_names[:num_inputs]

            if 'pt' in model_source:
                with torch.no_grad():
                    onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_pt(
                        model_name, MODELS[model_name][1],
                        MODELS[model_name][2], MODELS[model_name][3],
                        model_class, cache_dir, onnx_dir, input_names, use_gpu,
                        precision, optimize_onnx, validate_onnx,
                        use_raw_attention_mask, overwrite,
                        model_fusion_statistics)
            if 'tf' in model_source:
                onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_tf(
                    model_name, MODELS[model_name][1], MODELS[model_name][2],
                    MODELS[model_name][3], model_class, cache_dir, onnx_dir,
                    input_names, use_gpu, precision, optimize_onnx,
                    validate_onnx, use_raw_attention_mask, overwrite,
                    model_fusion_statistics)

            if not is_valid_onnx_model:
                continue

            ort_session = create_onnxruntime_session(
                onnx_model_file,
                use_gpu,
                enable_all_optimization=True,
                num_threads=num_threads,
                verbose=verbose)
            if ort_session is None:
                continue

            ort_output_names = [
                node_arg.name for node_arg in ort_session.get_outputs()
            ]
            output_buffers = {"last_state": None, "pooler": None}
            device = "cuda" if use_gpu else "cpu"
            config = AutoConfig.from_pretrained(model_name,
                                                cache_dir=cache_dir)
            max_last_state_size = numpy.prod([
                max(batch_sizes),
                max(sequence_lengths),
                max(vocab_size, config.hidden_size)
            ])
            max_pooler_size = numpy.prod(
                [max(batch_sizes), config.hidden_size])
            for batch_size in batch_sizes:
                if batch_size <= 0:
                    continue
                for sequence_length in sequence_lengths:
                    if max_sequence_length is not None and sequence_length > max_sequence_length:
                        continue

                    input_value_type = numpy.int64 if 'pt' in model_source else numpy.int32
                    ort_inputs = create_onnxruntime_input(
                        vocab_size, batch_size, sequence_length, input_names,
                        input_value_type)

                    result_template = {
                        "engine": "onnxruntime",
                        "version": onnxruntime.__version__,
                        "device": device,
                        "optimizer": optimize_onnx,
                        "precision": precision,
                        "io_binding": not disable_ort_io_binding,
                        "model_name": model_name,
                        "inputs": num_inputs,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "datetime": str(datetime.now()),
                    }

                    logger.info(
                        "Run onnxruntime on {} with input shape {}".format(
                            model_name, [batch_size, sequence_length]))
                    if disable_ort_io_binding:
                        result = inference_ort(ort_session, ort_inputs,
                                               result_template, repeat_times,
                                               batch_size)
                    else:
                        # Get output sizes from a dummy ort run
                        ort_outputs = ort_session.run(ort_output_names,
                                                      ort_inputs)

                        data_type = numpy.longlong if 'pt' in model_source else numpy.int32
                        result = inference_ort_with_io_binding(
                            ort_session, ort_inputs, result_template,
                            repeat_times, ort_output_names, ort_outputs,
                            output_buffers, max_last_state_size,
                            max_pooler_size, batch_size, device, data_type)
                    logger.info(result)
                    results.append(result)

    return results
Example #14
def main():
    args = parse_arguments()
    setup_logger(args.verbose)

    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments:{args}")

    cache_dir = args.cache_dir
    output_dir = args.output if not args.output.endswith(
        ".onnx") else os.path.dirname(args.output)
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    if args.precision != Precision.FLOAT32:
        assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"

    if args.precision == Precision.FLOAT16:
        assert args.use_gpu, "fp16 requires --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    model_class = MODEL_CLASSES[args.model_class][0]
    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        cache_dir=cache_dir)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        config=config,
                                        cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    use_external_data_format = config.n_layer > 24  # TODO: find a way to check model size > 2GB
    onnx_model_paths = Gpt2Helper.get_onnx_paths(
        output_dir,
        args.model_name_or_path,
        args.model_class,
        new_folder=use_external_data_format)
    raw_onnx_model = args.output if args.output.endswith(
        '.onnx') else onnx_model_paths["raw"]
    output_path = raw_onnx_model if (
        args.output.endswith('.onnx') or
        (args.precision == Precision.FLOAT32 and not args.optimize_onnx)
    ) else onnx_model_paths[str(args.precision)]

    logger.info(f"Exporting ONNX model to {raw_onnx_model}")
    use_padding = MODEL_CLASSES[args.model_class][2]
    Gpt2Helper.export_onnx(model,
                           device,
                           raw_onnx_model,
                           args.verbose,
                           use_external_data_format,
                           has_position_ids=use_padding,
                           has_attention_mask=use_padding)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        logger.info(f"Optimizing model to {output_path}")
        Gpt2Helper.optimize_onnx(raw_onnx_model, output_path,
                                 args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads,
                                 model.config.hidden_size)

    if args.precision == Precision.INT8:
        logger.info("quantizing model...")
        QuantizeHelper.quantize_onnx_model(output_path, output_path)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")

    session = create_onnxruntime_session(output_path,
                                         args.use_gpu,
                                         enable_all_optimization=True,
                                         verbose=args.verbose)
    if session is not None:
        Gpt2Helper.test_parity(session,
                               model,
                               device,
                               args.precision == Precision.FLOAT16,
                               rtol=args.tolerance,
                               atol=args.tolerance,
                               model_class=args.model_class,
                               has_position_ids=use_padding,
                               has_attention_mask=use_padding)

    if args.input_test_file:
        test_inputs = []
        # Each line of test file is a JSON string like:
        # {"input_ids": [[14698, 257, 1310, 13688, 319, 326]]}
        with open(args.input_test_file) as read_f:
            for i, line in enumerate(read_f):
                line = line.rstrip()
                data = json.loads(line)
                input_ids = torch.from_numpy(
                    numpy.asarray(data["input_ids"],
                                  dtype=numpy.int64)).to(device)

                if use_padding:
                    if "attention_mask" in data:
                        numpy_float = numpy.float16 if args.precision == Precision.FLOAT16 else numpy.float32
                        attention_mask = torch.from_numpy(
                            numpy.asarray(data["attention_mask"],
                                          dtype=numpy_float)).to(device)
                    else:
                        padding = -1
                        attention_mask = (
                            input_ids !=
                            padding).type(torch.float16 if args.precision ==
                                          Precision.FLOAT16 else torch.float32)
                        input_ids.masked_fill_(input_ids == padding, 0)

                    if "position_ids" in data:
                        position_ids = torch.from_numpy(
                            numpy.asarray(data["position_ids"],
                                          dtype=numpy.int64)).to(device)
                    else:
                        position_ids = (attention_mask.long().cumsum(-1) - 1)
                        position_ids.masked_fill_(position_ids < 0, 0)

                    inputs = {
                        "input_ids": input_ids,
                        "position_ids": position_ids,
                        "attention_mask": attention_mask
                    }
                else:
                    inputs = {"input_ids": input_ids}

                test_inputs.append(inputs)

        Gpt2Tester.test_generation(session,
                                   model,
                                   device,
                                   test_inputs,
                                   precision=args.precision,
                                   model_class=args.model_class,
                                   top_k=20,
                                   top_k_no_order=True,
                                   max_steps=24,
                                   max_inputs=0,
                                   verbose=args.verbose)

    logger.info(f"Done. Output model: {output_path}")
Example #15
def main():
    args = parse_arguments()
    setup_logger(args.verbose)

    logger.info(f"Arguments:{args}")
    if args.precision == Precision.FLOAT16:
        assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    torch.set_num_threads(
        psutil.cpu_count(
            logical=True) if args.thread_num <= 0 else args.thread_num)
    print(torch.__config__.parallel_info())

    cache_dir = args.cache_dir
    output_dir = args.onnx_dir
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    model_class = MODEL_CLASSES[args.model_class][0]

    config = AutoConfig.from_pretrained(args.model_name,
                                        torchscript=args.torchscript,
                                        cache_dir=cache_dir)
    if hasattr(config, 'return_tuple'):
        config.return_tuple = True
    model = model_class.from_pretrained(args.model_name,
                                        config=config,
                                        cache_dir=cache_dir)

    # This script does not support float16 for PyTorch.
    #if args.float16:
    #    model.half()

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.to(device)

    onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir, args.model_name,
                                                 args.model_class)

    onnx_model_path = onnx_model_paths["raw"]
    Gpt2Helper.export_onnx(model, device, onnx_model_path, args.verbose)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        onnx_model_path = onnx_model_paths[str(args.precision)]
        Gpt2Helper.optimize_onnx(onnx_model_paths["raw"], onnx_model_path,
                                 args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads,
                                 model.config.hidden_size)

        if args.precision == Precision.INT8:
            logger.info("quantizing model...")
            QuantizeHelper.quantize_onnx_model(onnx_model_path,
                                               onnx_model_path)
            model = QuantizeHelper.quantize_torch_model(model)
            logger.info("finished quantizing model")

    if args.torchscript:
        model = Gpt2Helper.torchscript(model, config, device)

    session = create_onnxruntime_session(onnx_model_path,
                                         args.use_gpu,
                                         enable_all_optimization=False,
                                         num_threads=args.thread_num,
                                         verbose=args.verbose)
    if session is None:
        return

    # One word is generated for each inference. This length does not include that of past state.
    sequence_length = 1

    # Allocate output buffers for IO Binding
    max_output_shapes = Gpt2Helper.get_output_shapes(
        max(args.batch_sizes), max(args.past_sequence_lengths),
        sequence_length, config, args.model_class)
    output_buffers = Gpt2Helper.get_output_buffers(
        max_output_shapes, device, args.precision == Precision.FLOAT16)

    csv_filename = args.result_csv or "benchmark_result_{}.csv".format(
        datetime.now().strftime("%Y%m%d-%H%M%S"))
    with open(csv_filename, mode="a", newline='') as csv_file:
        column_names = [
            "model_name", "model_class", "gpu", "precision", "optimizer",
            "torchscript", "batch_size", "past_sequence_length",
            "torch_latency", "ort_latency", "ort_io_latency"
        ]
        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        csv_writer.writeheader()

        for batch_size in args.batch_sizes:
            for past_sequence_length in args.past_sequence_lengths:
                logger.debug(
                    f"Running test for batch_size={batch_size} past_sequence_length={past_sequence_length}..."
                )
                dummy_inputs = Gpt2Helper.get_dummy_inputs(
                    batch_size, past_sequence_length, sequence_length,
                    config.num_attention_heads, config.hidden_size,
                    config.n_layer, config.vocab_size, device,
                    args.precision == Precision.FLOAT16)
                output_shapes = Gpt2Helper.get_output_shapes(
                    batch_size, past_sequence_length, sequence_length, config,
                    args.model_class)

                try:
                    outputs, torch_latency = Gpt2Helper.pytorch_inference(
                        model, dummy_inputs, args.test_times)
                    ort_outputs, ort_latency = Gpt2Helper.onnxruntime_inference(
                        session, dummy_inputs, args.test_times)
                    ort_io_outputs, ort_io_latency = Gpt2Helper.onnxruntime_inference_with_binded_io(
                        session, dummy_inputs, output_buffers, output_shapes,
                        args.test_times)
                    if args.validate_onnx:
                        if Gpt2Helper.compare_outputs(
                                outputs,
                                ort_outputs,
                                rtol=DEFAULT_TOLERANCE[args.precision],
                                atol=DEFAULT_TOLERANCE[args.precision]):
                            logger.info(
                                f'Pytorch and ONNX Runtime outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]}).'
                            )
                        if Gpt2Helper.compare_outputs(
                                outputs,
                                ort_io_outputs,
                                rtol=DEFAULT_TOLERANCE[args.precision],
                                atol=DEFAULT_TOLERANCE[args.precision]):
                            logger.info(
                                f'Pytorch and ONNX Runtime IO Binding outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]}).'
                            )

                    logger.info(
                        f"batch_size={batch_size}, past_sequence_length={past_sequence_length}, torch_latency={torch_latency:.2f}, ort_latency={ort_latency:.2f}, ort_io_latency={ort_io_latency:.2f}"
                    )

                    row = {
                        "model_name": args.model_name,
                        "model_class": args.model_class,
                        "gpu": args.use_gpu,
                        "precision": args.precision,
                        "optimizer": args.optimize_onnx,
                        "torchscript": args.torchscript,
                        "batch_size": batch_size,
                        "past_sequence_length": past_sequence_length,
                        "torch_latency": f"{torch_latency:.2f}",
                        "ort_latency": f"{ort_latency:.2f}",
                        "ort_io_latency": f"{ort_io_latency:.2f}"
                    }
                    csv_writer.writerow(row)
                except:
                    logger.error(f"Exception", exc_info=True)

    logger.info(f"Results are saved to file {csv_filename}")
Example #16
def main(argv=None,
         experiment_name="",
         run_id=0,
         csv_filename="gpt2_parity_results.csv"):
    result = {}
    from transformers import __version__ as transformers_version
    if version.parse(transformers_version) < version.parse(
            "3.1.0"):  # past_key_values name does not exist in 3.0.2 or older
        raise RuntimeError("This tool requires transformers 3.1.0 or later.")

    args = parse_arguments(argv)
    setup_logger(args.verbose)

    if not experiment_name:
        import sys
        experiment_name = " ".join(argv if argv else sys.argv[1:])

    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments:{args}")

    cache_dir = args.cache_dir
    output_dir = args.output if not args.output.endswith(
        ".onnx") else os.path.dirname(args.output)
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    if args.precision != Precision.FLOAT32:
        assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"

    if args.precision == Precision.FLOAT16:
        assert args.use_gpu, "fp16 requires --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    if args.use_external_data_format:
        assert not args.output.endswith(
            '.onnx'
        ), "output shall be a directory for --use_external_data_format"

    model_class = MODEL_CLASSES[args.model_class][0]
    use_padding = MODEL_CLASSES[args.model_class][2]

    if args.model_class == "GPT2LMHeadModel_BeamSearchStep":
        model_type = "beam_search_step"
    elif args.model_class == "GPT2LMHeadModel_ConfigurableOneStepSearch":
        model_type = "configurable_one_step_search"
    else:
        model_type = "default"

    gpt2helper = Gpt2HelperFactory.create_helper(model_type)
    gpt2tester = Gpt2TesterFactory.create_tester(model_type)
    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        cache_dir=cache_dir)
    if model_type == 'beam_search_step':
        model = model_class.from_pretrained(args.model_name_or_path,
                                            config=config,
                                            batch_size=1,
                                            beam_size=args.beam_size,
                                            cache_dir=cache_dir)
    elif model_type == 'configurable_one_step_search':
        model = model_class.from_pretrained(
            args.model_name_or_path,
            config=config,
            batch_size=1,
            beam_size=args.beam_size,
            ignore_eos=args.ignore_eos,
            temperature=args.temperature,
            repetition_penalty=args.repetition_penalty,
            excluded_token_ids=args.excluded_token_ids,
            length_penalty=args.length_penalty,
            do_sample=args.do_sample,
            do_sample_top_p=args.do_sample_top_p,
            do_sample_top_k=args.do_sample_top_k,
            cache_dir=cache_dir)
    else:
        model = model_class.from_pretrained(args.model_name_or_path,
                                            config=config,
                                            cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    if (not args.use_external_data_format) and (config.n_layer > 24):
        logger.info(f"Try --use_external_data_format when model size > 2GB")

    onnx_model_paths = gpt2helper.get_onnx_paths(
        output_dir,
        args.model_name_or_path,
        args.model_class,
        new_folder=args.use_external_data_format,
        remove_existing=[
            "fp32", "fp16", "int8"
        ])  # Do not remove raw model to save time in parity test

    raw_onnx_model = onnx_model_paths["raw"]

    if os.path.exists(raw_onnx_model):
        logger.warning(
            f"Skip exporting ONNX model since it existed: {raw_onnx_model}")
    else:
        logger.info(f"Exporting ONNX model to {raw_onnx_model}")
        gpt2helper.export_onnx(model,
                               device,
                               raw_onnx_model,
                               args.verbose,
                               args.use_external_data_format,
                               has_position_ids=use_padding,
                               has_attention_mask=use_padding,
                               input_ids_dtype=torch.int32
                               if args.use_int32_inputs else torch.int64,
                               position_ids_dtype=torch.int32
                               if args.use_int32_inputs else torch.int64,
                               attention_mask_dtype=torch.int32
                               if args.use_int32_inputs else torch.int64)

    fp16_params = {"keep_io_types": args.keep_io_types}
    if args.io_block_list:
        fp16_params["keep_io_types"] = args.io_block_list
    if args.node_block_list:
        fp16_params["node_block_list"] = args.node_block_list
    if args.op_block_list:
        fp16_params["op_block_list"] = args.op_block_list
    if args.force_fp16_initializers:
        fp16_params["force_fp16_initializers"] = args.force_fp16_initializers

    is_io_float16 = (args.precision == Precision.FLOAT16
                     and not args.keep_io_types)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        output_path = onnx_model_paths[str(args.precision)
                                       if args.precision != Precision.INT8 else 'fp32']

        logger.info(f"Optimizing model to {output_path}")
        gpt2helper.optimize_onnx(
            raw_onnx_model,
            output_path,
            args.precision == Precision.FLOAT16,
            model.config.num_attention_heads,
            model.config.hidden_size,
            args.use_external_data_format,
            auto_mixed_precision=args.auto_mixed_precision,
            **fp16_params)
    else:
        output_path = raw_onnx_model

    if args.precision == Precision.INT8:
        logger.info("quantizing model...")
        QuantizeHelper.quantize_onnx_model(output_path,
                                           onnx_model_paths['int8'],
                                           args.use_external_data_format)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")
        output_path = onnx_model_paths['int8']

    if args.output.endswith(
            '.onnx'
    ) and output_path != args.output and not args.use_external_data_format:
        import shutil
        shutil.move(output_path, args.output)
        output_path = args.output

    logger.info(f"Output path: {output_path}")
    model_size_in_MB = int(
        get_onnx_model_size(output_path, args.use_external_data_format) /
        1024 / 1024)

    session = create_onnxruntime_session(output_path,
                                         args.use_gpu,
                                         enable_all_optimization=True,
                                         verbose=args.verbose)
    if args.model_class == "GPT2LMHeadModel" and session is not None:
        parity_result = gpt2helper.test_parity(
            session,
            model,
            device,
            is_io_float16,
            rtol=args.tolerance,
            atol=args.tolerance,
            model_class=args.model_class,
            has_position_ids=use_padding,
            has_attention_mask=use_padding,
            input_ids_dtype=torch.int32
            if args.use_int32_inputs else torch.int64,
            position_ids_dtype=torch.int32
            if args.use_int32_inputs else torch.int64,
            attention_mask_dtype=torch.int32
            if args.use_int32_inputs else torch.int64,
            test_cases_per_run=args.test_cases,
            total_runs=args.test_runs,
            verbose=args.verbose)

        latency = gpt2helper.test_performance(
            session,
            model,
            device,
            is_io_float16,
            total_runs=100,
            use_io_binding=True,
            model_class=args.model_class,
            has_position_ids=use_padding,
            has_attention_mask=use_padding,
            input_ids_dtype=torch.int32
            if args.use_int32_inputs else torch.int64,
            position_ids_dtype=torch.int32
            if args.use_int32_inputs else torch.int64,
            attention_mask_dtype=torch.int32
            if args.use_int32_inputs else torch.int64,
            batch_size=8,
            sequence_length=1,
            past_sequence_length=32)

        if args.precision == Precision.FLOAT16:
            logger.info(f"fp16 conversion parameters:{fp16_params}")

        # Write results to file
        import csv
        from onnxruntime import __version__ as ort_version
        latency_name = get_latency_name()
        csv_file_existed = os.path.exists(csv_filename)
        with open(csv_filename, mode="a", newline='') as csv_file:
            column_names = [
                "experiment", "run_id", "model_name", "model_class", "gpu",
                "precision", "optimizer", "test_cases", "runs",
                "keep_io_types", "io_block_list", "op_block_list",
                "node_block_list", "force_fp16_initializers",
                "auto_mixed_precision", "ORT_TRANSFORMER_OPTIONS",
                "ORT_CUDA_GEMM_OPTIONS", "onnxruntime", latency_name,
                "top1_match_rate", "onnx_size_in_MB", "diff_50_percentile",
                "diff_90_percentile", "diff_95_percentile",
                "diff_99_percentile", "diff_pass_rate", "nan_rate",
                "top1_match_rate_per_run"
            ]
            csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
            if not csv_file_existed:
                csv_writer.writeheader()
            row = {
                "experiment": experiment_name,
                "run_id": run_id,
                "model_name": args.model_name_or_path,
                "model_class": args.model_class,
                "gpu": args.use_gpu,
                "precision": args.precision,
                "optimizer": args.optimize_onnx,
                "test_cases": args.test_cases,
                "runs": args.test_runs,
                "keep_io_types": args.keep_io_types,
                "io_block_list": args.io_block_list,
                "op_block_list": args.op_block_list,
                "node_block_list": args.node_block_list,
                "force_fp16_initializers": args.force_fp16_initializers,
                "auto_mixed_precision": args.auto_mixed_precision,
                "ORT_TRANSFORMER_OPTIONS":
                os.getenv('ORT_TRANSFORMER_OPTIONS'),
                "ORT_CUDA_GEMM_OPTIONS": os.getenv('ORT_CUDA_GEMM_OPTIONS'),
                "onnxruntime": ort_version,
                latency_name: f"{latency:.2f}",
                "diff_50_percentile": parity_result["max_diff_percentile_50"],
                "diff_90_percentile": parity_result["max_diff_percentile_90"],
                "diff_95_percentile": parity_result["max_diff_percentile_95"],
                "diff_99_percentile": parity_result["max_diff_percentile_99"],
                "diff_pass_rate": parity_result["diff_pass_rate"],
                "nan_rate": parity_result["nan_rate"],
                "top1_match_rate": parity_result["top1_match_rate"],
                "top1_match_rate_per_run":
                parity_result["top1_match_rate_per_run"],
                "onnx_size_in_MB": "{}".format(model_size_in_MB),
            }
            logger.info(f"result: {row}")
            result.update(row)
            csv_writer.writerow(row)

    if args.input_test_file:
        test_inputs = []
        # Each line of test file is a JSON string like:
        # {"input_ids": [[14698, 257, 1310, 13688, 319, 326]]}
        with open(args.input_test_file) as read_f:
            for _, line in enumerate(read_f):
                line = line.rstrip()
                data = json.loads(line)
                input_ids = torch.from_numpy(
                    numpy.asarray(data["input_ids"],
                                  dtype=numpy.int64)).to(device)

                if use_padding:
                    if "attention_mask" in data:
                        numpy_float = numpy.float16 if is_io_float16 else numpy.float32
                        attention_mask = torch.from_numpy(
                            numpy.asarray(data["attention_mask"],
                                          dtype=numpy_float)).to(device)
                    else:
                        padding = -1
                        attention_mask = (input_ids != padding).type(
                            torch.float16 if is_io_float16 else torch.float32)
                        input_ids.masked_fill_(input_ids == padding, 0)

                    if "position_ids" in data:
                        position_ids = torch.from_numpy(
                            numpy.asarray(data["position_ids"],
                                          dtype=numpy.int64)).to(device)
                    else:
                        position_ids = (attention_mask.long().cumsum(-1) - 1)
                        position_ids.masked_fill_(position_ids < 0, 0)

                    inputs = {
                        "input_ids": input_ids.to(torch.int32) if args.use_int32_inputs else input_ids,
                        "position_ids": position_ids.to(torch.int32) if args.use_int32_inputs else position_ids,
                        "attention_mask": attention_mask.to(torch.int32) if args.use_int32_inputs else attention_mask
                    }
                else:
                    inputs = {
                        "input_ids": input_ids.to(torch.int32) if args.use_int32_inputs else input_ids
                    }

                if model_type == "beam_search_step" or model_type == "configurable_one_step_search":
                    beam_select_idx = torch.zeros([1,
                                                   input_ids.shape[0]]).long()

                    input_log_probs = torch.zeros([input_ids.shape[0], 1])
                    input_unfinished_sents = torch.ones(
                        [input_ids.shape[0], 1], dtype=torch.bool)
                    inputs.update({
                        "beam_select_idx": beam_select_idx,
                        "input_log_probs": input_log_probs,
                        "input_unfinished_sents": input_unfinished_sents,
                    })

                test_inputs.append(inputs)

        gpt2tester.test_generation(session,
                                   model,
                                   device,
                                   test_inputs,
                                   precision=args.precision,
                                   model_class=args.model_class,
                                   top_k=20,
                                   top_k_no_order=True,
                                   max_steps=24,
                                   max_inputs=0,
                                   verbose=args.verbose,
                                   save_test_data=3,
                                   save_test_data_dir=Path(output_path).parent)

    logger.info(f"Done. Output model: {output_path}")
    return result
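
# Because main() accepts an argv list and returns the result row as a dict, it can be
# driven programmatically, e.g. to sweep precisions within one parity experiment.
# This is only a sketch: the flag names below (--model_name_or_path, --precision,
# --optimize_onnx, --use_gpu) are assumptions about parse_arguments(), which is not shown.
results = []
for precision, extra_flags in [("fp32", []), ("fp16", ["--optimize_onnx", "--use_gpu"])]:
    argv = ["--model_name_or_path", "gpt2", "--precision", precision] + extra_flags
    results.append(main(argv, experiment_name=f"gpt2 {precision}", run_id=len(results)))

for result in results:
    print(result.get("precision"), result.get("top1_match_rate"), result.get("onnx_size_in_MB"))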
Example #17
0
def test_all(args):
    # Currently, the Longformer attention operator can only run on GPU (there is no CPU implementation yet).
    device = torch.device('cuda:0')

    results = []
    for model_name in args.models:
        # Load the pretrained PyTorch model.
        from transformers import LongformerModel
        torch_model_name_or_dir = PRETRAINED_LONGFORMER_MODELS[model_name]
        model = LongformerModel.from_pretrained(
            torch_model_name_or_dir)  # pretrained model name or directory
        model.to(device)

        # Search onnx model in the following order: optimized fp16 model, optimized fp32 model, raw model
        # TODO: call convert_longformer_to_onnx to export onnx instead.
        import os.path
        optimized = False
        precision = 'fp32'
        onnx_model_path = os.path.join(args.onnx_dir, model_name + ".onnx")
        optimized_fp32_model = os.path.join(args.onnx_dir,
                                            model_name + "_fp32.onnx")
        optimized_fp16_model = os.path.join(args.onnx_dir,
                                            model_name + "_fp16.onnx")
        if os.path.isfile(optimized_fp16_model):
            onnx_model_path = optimized_fp16_model
            optimized = True
            precision = 'fp16'
        elif os.path.isfile(optimized_fp32_model):
            onnx_model_path = optimized_fp32_model
            optimized = True
        print("ONNX model path:", onnx_model_path)

        for num_threads in args.num_threads:
            if "torch" in args.engines:
                results += test_torch_latency(device, model, model_name,
                                              args.batch_sizes,
                                              args.sequence_lengths,
                                              args.global_lengths,
                                              args.test_times, num_threads,
                                              args.verbose)

            if "onnxruntime" in args.engines:
                if args.memory:
                    test_ort_memory(device, onnx_model_path,
                                    args.batch_sizes[0],
                                    args.sequence_lengths[0],
                                    args.global_lengths[0], args.test_times,
                                    num_threads)
                else:  # test latency
                    session = benchmark_helper.create_onnxruntime_session(
                        onnx_model_path,
                        use_gpu=True,
                        enable_all_optimization=True,
                        num_threads=num_threads)
                    if session is None:
                        raise RuntimeError(
                            f"Failed to create ORT sesssion from ONNX file {onnx_model_path}"
                        )

                    results += test_ort_latency(
                        device, model, model_name, session, args.batch_sizes,
                        args.sequence_lengths, args.global_lengths,
                        args.test_times, num_threads, optimized, precision,
                        args.validate_onnx, args.disable_io_binding,
                        args.verbose)
    return results
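
# The ONNX path lookup above (optimized fp16, then optimized fp32, then the raw export)
# can be factored into a small helper. This sketch reuses the same file-name suffixes;
# find_longformer_onnx is a hypothetical name, not part of the benchmark script.
import os.path

def find_longformer_onnx(onnx_dir, model_name):
    """Return (onnx_model_path, optimized, precision) using the search order from test_all."""
    candidates = [
        (os.path.join(onnx_dir, model_name + "_fp16.onnx"), True, "fp16"),
        (os.path.join(onnx_dir, model_name + "_fp32.onnx"), True, "fp32"),
        (os.path.join(onnx_dir, model_name + ".onnx"), False, "fp32"),
    ]
    for path, optimized, precision in candidates:
        if os.path.isfile(path):
            return path, optimized, precision
    # Fall back to the raw model path, matching the default in the loop above.
    return candidates[-1]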
Example #18
0
def main():
    args = parse_arguments()
    setup_logger(args.verbose)

    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments:{args}")

    cache_dir = args.cache_dir
    output_dir = args.output if not args.output.endswith(
        ".onnx") else os.path.dirname(args.output)
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    if args.precision != Precision.FLOAT32:
        assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"

    if args.precision == Precision.FLOAT16:
        assert args.use_gpu, "fp16 requires --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    model_class = MODEL_CLASSES[args.model_class][0]
    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        cache_dir=cache_dir)
    if hasattr(config, 'return_tuple'):
        config.return_tuple = True
    model = model_class.from_pretrained(args.model_name_or_path,
                                        config=config,
                                        cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir,
                                                 args.model_name_or_path,
                                                 args.model_class)
    raw_onnx_model = args.output if args.output.endswith(
        '.onnx') else onnx_model_paths["raw"]
    output_path = raw_onnx_model if (
        args.output.endswith('.onnx') or
        (args.precision == Precision.FLOAT32 and not args.optimize_onnx)
    ) else onnx_model_paths[str(args.precision)]

    Gpt2Helper.export_onnx(model, device, raw_onnx_model, args.verbose)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        Gpt2Helper.optimize_onnx(raw_onnx_model, output_path,
                                 args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads,
                                 model.config.hidden_size)

    if args.precision == Precision.INT8:
        logger.info("quantizing model...")
        QuantizeHelper.quantize_onnx_model(output_path, output_path)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")

    session = create_onnxruntime_session(output_path,
                                         args.use_gpu,
                                         enable_all_optimization=False,
                                         verbose=args.verbose)
    if session is not None:
        Gpt2Helper.test_parity(session,
                               model,
                               device,
                               args.precision == Precision.FLOAT16,
                               rtol=args.tolerance,
                               atol=args.tolerance,
                               model_class=args.model_class)

    if args.input_test_file:
        test_inputs = []
        with open(args.input_test_file) as read_f:
            for i, line in enumerate(read_f):
                line = line.rstrip()
                data = json.loads(line)
                input_ids = torch.from_numpy(
                    numpy.asarray(data["input_ids"],
                                  dtype=numpy.int64)).to(device)
                position_ids = torch.from_numpy(
                    numpy.asarray(data["position_ids"],
                                  dtype=numpy.int64)).to(device)
                numpy_float = numpy.float16 if args.precision == Precision.FLOAT16 else numpy.float32
                attention_mask = torch.from_numpy(
                    numpy.asarray(data["attention_mask"],
                                  dtype=numpy_float)).to(device)
                inputs = {
                    "input_ids": input_ids,
                    "position_ids": position_ids,
                    "attention_mask": attention_mask
                }
                test_inputs.append(inputs)
        Gpt2Tester.test_generation(session,
                                   model,
                                   device,
                                   test_inputs,
                                   precision=args.precision,
                                   model_class=args.model_class,
                                   top_k=20,
                                   top_k_no_order=True,
                                   max_steps=24,
                                   max_inputs=0,
                                   verbose=args.verbose)

    logger.info(f"Done. Output model: {output_path}")
Example #19
0
def export_onnx_models(model_name_or_path,
                       cache_dir,
                       output_dir,
                       use_gpu,
                       use_external_data_format,
                       optimize_onnx,
                       precision,
                       verbose,
                       use_decoder_start_token: bool = True,
                       merge_encoder_and_decoder_init: bool = True,
                       overwrite: bool = False):
    device = torch.device("cuda:0" if use_gpu else "cpu")

    models = T5Helper.load_model(model_name_or_path, cache_dir, device, merge_encoder_and_decoder_init)
    config = models["decoder"].config

    if (not use_external_data_format) and (config.num_layers > 24):
        logger.info(f"Try use_external_data_format when model size > 2GB")

    output_paths = []
    for name, model in models.items():
        filename_suffix = "_" + name

        onnx_path = T5Helper.get_onnx_path(output_dir,
                                           model_name_or_path,
                                           suffix=filename_suffix,
                                           new_folder=use_external_data_format)

        if overwrite or not os.path.exists(onnx_path):
            logger.info(f"Exporting ONNX model to {onnx_path}")
            # We have to clone the model before exporting to ONNX; otherwise verify_onnx will report a large difference.
            T5Helper.export_onnx(copy.deepcopy(model),
                                 device,
                                 onnx_path,
                                 verbose,
                                 use_external_data_format,
                                 use_decoder_input_ids=not use_decoder_start_token)
        else:
            logger.info(f"Skip exporting: existed ONNX model {onnx_path}")

        # Optimize ONNX graph. Note that we have not implemented graph optimization for T5 yet.
        if optimize_onnx or precision != Precision.FLOAT32:
            output_path = T5Helper.get_onnx_path(output_dir,
                                                 model_name_or_path,
                                                 suffix=filename_suffix + "_" + str(precision),
                                                 new_folder=use_external_data_format)

            if overwrite or not os.path.exists(output_path):
                logger.info(f"Optimizing model to {output_path}")
                T5Helper.optimize_onnx(onnx_path, output_path, precision == Precision.FLOAT16, config.num_heads,
                                       config.hidden_size, use_external_data_format)
            else:
                logger.info(f"Skip optimizing: existed ONNX model {onnx_path}")
        else:
            output_path = onnx_path

        ort_session = create_onnxruntime_session(
            output_path,
            use_gpu=use_gpu,
            provider=['CUDAExecutionProvider', 'CPUExecutionProvider'] if use_gpu else ['CPUExecutionProvider'])
        max_diff = T5Helper.verify_onnx(model, ort_session, device)
        logger.info(f'PyTorch and OnnxRuntime results max difference = {max_diff}')
        if max_diff > 1e-4:
            logger.warning('PyTorch and OnnxRuntime results are NOT close')

        output_paths.append(output_path)

    return output_paths
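
# A sketch of a typical call to export_onnx_models(): it returns one ONNX path per
# sub-model produced by T5Helper.load_model (e.g. encoder and decoder pieces).
# The model name and directories below are placeholders, not values from this script.
output_paths = export_onnx_models(model_name_or_path="t5-small",
                                  cache_dir="./cache_models",
                                  output_dir="./onnx_models",
                                  use_gpu=False,
                                  use_external_data_format=False,
                                  optimize_onnx=False,
                                  precision=Precision.FLOAT32,
                                  verbose=False)
for exported_path in output_paths:
    print("exported:", exported_path)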
Example #20
0
def main(args):
    logger.info(f"Arguments:{args}")
    if args.precision == Precision.FLOAT16:
        assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    torch.set_num_threads(psutil.cpu_count(logical=True) if args.thread_num <= 0 else args.thread_num)
    print(torch.__config__.parallel_info())

    cache_dir = args.cache_dir
    output_dir = args.onnx_dir
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    model_class = MODEL_CLASSES[args.model_class][0]

    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        torchscript=args.torchscript,
                                        cache_dir=cache_dir)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        config=config,
                                        cache_dir=cache_dir)

    # This script does not support float16 for PyTorch.
    #if args.float16:
    #    model.half()

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.to(device)
    use_external_data_format = config.n_layer > 24  # TODO: find a way to check model size > 2GB
    onnx_model_paths = Gpt2Helper.get_onnx_paths(
        output_dir,
        args.model_name_or_path,
        args.model_class,
        has_past=True,
        new_folder=use_external_data_format)

    onnx_model_path = onnx_model_paths["raw"]
    use_padding = MODEL_CLASSES[args.model_class][2]
    Gpt2Helper.export_onnx(model,
                           device,
                           onnx_model_path,
                           args.verbose,
                           use_external_data_format,
                           has_position_ids=use_padding,
                           has_attention_mask=use_padding)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        onnx_model_path = onnx_model_paths[str(args.precision)
                                           if args.precision != Precision.INT8 else 'fp32']
        Gpt2Helper.optimize_onnx(onnx_model_paths["raw"], onnx_model_path,
                                 args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads,
                                 model.config.hidden_size,
                                 use_external_data_format)

        if args.precision == Precision.INT8:
            logger.info("quantizing model...")
            QuantizeHelper.quantize_onnx_model(onnx_model_path,
                                               onnx_model_paths["int8"],
                                               use_external_data_format)
            model = QuantizeHelper.quantize_torch_model(model)
            logger.info("finished quantizing model")
            onnx_model_path = onnx_model_paths["int8"]

    if args.torchscript:
        model = Gpt2Helper.torchscript(model,
                                       config,
                                       device,
                                       has_position_ids=use_padding,
                                       has_attention_mask=use_padding)

    session = create_onnxruntime_session(onnx_model_path,
                                         args.use_gpu,
                                         enable_all_optimization=False,
                                         num_threads=args.thread_num,
                                         verbose=args.verbose)
    if session is None:
        return

    # Allocate output buffers for IO Binding
    max_output_shapes = Gpt2Helper.get_output_shapes(
        max(args.batch_sizes), max(args.past_sequence_lengths),
        max(args.sequence_lengths), config, args.model_class)
    output_buffers = Gpt2Helper.get_output_buffers(
        max_output_shapes, device, args.precision == Precision.FLOAT16)

    csv_filename = args.result_csv or "benchmark_result_{}.csv".format(
        datetime.now().strftime("%Y%m%d-%H%M%S"))
    with open(csv_filename, mode="a", newline='') as csv_file:
        column_names = [
            "model_name", "model_class", "gpu", "precision", "optimizer",
            "torchscript", "batch_size", "sequence_length",
            "past_sequence_length", "torch_latency", "onnxruntime_latency",
            "onnxruntime_io_binding_latency"
        ]
        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        csv_writer.writeheader()

        for batch_size in args.batch_sizes:
            for sequence_length in args.sequence_lengths:
                for past_sequence_length in args.past_sequence_lengths:
                    assert batch_size > 0 and sequence_length > 0 and past_sequence_length >= 0
                    logger.debug(
                        f"Running test for batch_size={batch_size} sequence_length={sequence_length} past_sequence_length={past_sequence_length}..."
                    )
                    dummy_inputs = Gpt2Helper.get_dummy_inputs(
                        batch_size,
                        past_sequence_length,
                        sequence_length,
                        config.num_attention_heads,
                        config.hidden_size,
                        config.n_layer,
                        config.vocab_size,
                        device,
                        float16=(args.precision == Precision.FLOAT16),
                        has_position_ids=use_padding,
                        has_attention_mask=use_padding)
                    output_shapes = Gpt2Helper.get_output_shapes(
                        batch_size, past_sequence_length, sequence_length,
                        config, args.model_class)

                    try:
                        outputs, torch_latency = Gpt2Helper.pytorch_inference(
                            model, dummy_inputs, args.test_times)
                        ort_outputs, ort_latency = Gpt2Helper.onnxruntime_inference(
                            session, dummy_inputs, args.test_times)
                        ort_io_outputs, ort_io_latency = Gpt2Helper.onnxruntime_inference_with_binded_io(
                            session,
                            dummy_inputs,
                            output_buffers,
                            output_shapes,
                            args.test_times,
                            return_numpy=False,
                            include_copy_output_latency=args.include_copy_output_latency)

                        if args.validate_onnx:
                            if Gpt2Helper.compare_outputs(
                                    outputs,
                                    ort_outputs,
                                    rtol=DEFAULT_TOLERANCE[args.precision],
                                    atol=DEFAULT_TOLERANCE[args.precision]):
                                logger.info(
                                    f'Pytorch and ONNX Runtime outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]}).'
                                )

                            # Results of IO binding might be in GPU. Copy outputs to CPU for comparison.
                            copy_outputs = []
                            for output in ort_io_outputs:
                                copy_outputs.append(output.cpu().numpy())

                            if Gpt2Helper.compare_outputs(
                                    outputs,
                                    copy_outputs,
                                    rtol=DEFAULT_TOLERANCE[args.precision],
                                    atol=DEFAULT_TOLERANCE[args.precision]):
                                logger.info(
                                    f'Pytorch and ONNX Runtime IO Binding outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]}).'
                                )

                        logger.info(
                            f"batch_size={batch_size}, sequence_length={sequence_length}, past_sequence_length={past_sequence_length}, torch_latency={torch_latency:.2f}, onnxruntime_latency={ort_latency:.2f}, onnxruntime_io_binding_latency={ort_io_latency:.2f}"
                        )

                        row = {
                            "model_name": args.model_name_or_path,
                            "model_class": args.model_class,
                            "gpu": args.use_gpu,
                            "precision": args.precision,
                            "optimizer": args.optimize_onnx,
                            "torchscript": args.torchscript,
                            "batch_size": batch_size,
                            "sequence_length": sequence_length,
                            "past_sequence_length": past_sequence_length,
                            "torch_latency": f"{torch_latency:.2f}",
                            "onnxruntime_latency": f"{ort_latency:.2f}",
                            "onnxruntime_io_binding_latency": f"{ort_io_latency:.2f}"
                        }
                        csv_writer.writerow(row)
                    except Exception:
                        logger.error("Exception", exc_info=True)

    logger.info(f"Results are saved to file {csv_filename}")
    return csv_filename
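
# Gpt2Helper.compare_outputs (not shown here) checks that PyTorch and ONNX Runtime outputs
# agree within the per-precision tolerances used above. Conceptually it can be approximated
# with numpy.allclose; this is only an illustration, not the helper's actual implementation.
import numpy

def outputs_all_close(torch_outputs, ort_outputs, rtol, atol):
    """Rough element-wise closeness check over two lists of output tensors."""
    if len(torch_outputs) != len(ort_outputs):
        return False
    for expected, actual in zip(torch_outputs, ort_outputs):
        expected = expected.cpu().numpy() if hasattr(expected, "cpu") else numpy.asarray(expected)
        if not numpy.allclose(numpy.asarray(actual), expected, rtol=rtol, atol=atol):
            return False
    return True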
Example #21
0
def main():
    from transformers import __version__ as transformers_version
    if version.parse(transformers_version) < version.parse(
            "3.1.0"):  # past_key_values name does not exist in 3.0.2 or older
        raise RuntimeError("This tool requires transformers 3.1.0 or later.")

    args = parse_arguments()
    setup_logger(args.verbose)

    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments:{args}")

    cache_dir = args.cache_dir
    output_dir = args.output if not args.output.endswith(".onnx") else os.path.dirname(args.output)
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    if args.precision != Precision.FLOAT32:
        assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"

    if args.precision == Precision.FLOAT16:
        assert args.use_gpu, "fp16 requires --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    if args.use_external_data_format:
        assert not args.output.endswith('.onnx'), "output shall be a directory for --use_external_data_format"

    model_class = MODEL_CLASSES[args.model_class][0]
    if args.model_class == "GPT2LMHeadModel_BeamSearchStep":
        model_type = "beam_search_step"
    elif args.model_class == "GPT2LMHeadModel_ConfigurableOneStepSearch":
        model_type = "configurable_one_step_search"
    else:
        model_type = "default"

    gpt2helper = Gpt2HelperFactory.create_helper(model_type)
    gpt2tester = Gpt2TesterFactory.create_tester(model_type)
    config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir)
    if model_type == 'beam_search_step':
        model = model_class.from_pretrained(args.model_name_or_path,
                                            config=config,
                                            batch_size=1,
                                            beam_size=args.beam_size,
                                            cache_dir=cache_dir)
    elif model_type == 'configurable_one_step_search':
        model = model_class.from_pretrained(args.model_name_or_path,
                                            config=config,
                                            batch_size=1,
                                            beam_size=args.beam_size,
                                            ignore_eos=args.ignore_eos,
                                            temperature=args.temperature,
                                            repetition_penalty=args.repetition_penalty,
                                            excluded_token_ids=args.excluded_token_ids,
                                            length_penalty=args.length_penalty,
                                            do_sample=args.do_sample,
                                            do_sample_top_p=args.do_sample_top_p,
                                            do_sample_top_k=args.do_sample_top_k,
                                            cache_dir=cache_dir)
    else:
        model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    if (not args.use_external_data_format) and (config.n_layer > 24):
        logger.info(f"Try --use_external_data_format when model size > 2GB")

    onnx_model_paths = gpt2helper.get_onnx_paths(output_dir,
                                                 args.model_name_or_path,
                                                 args.model_class,
                                                 new_folder=args.use_external_data_format)

    raw_onnx_model = onnx_model_paths["raw"]

    logger.info(f"Exporting ONNX model to {raw_onnx_model}")
    use_padding = MODEL_CLASSES[args.model_class][2]
    gpt2helper.export_onnx(model,
                           device,
                           raw_onnx_model,
                           args.verbose,
                           args.use_external_data_format,
                           has_position_ids=use_padding,
                           has_attention_mask=use_padding)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        output_path = onnx_model_paths[str(args.precision) if args.precision != Precision.INT8 else 'fp32']

        logger.info(f"Optimizing model to {output_path}")
        gpt2helper.optimize_onnx(raw_onnx_model, output_path, args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads, model.config.hidden_size,
                                 args.use_external_data_format)
    else:
        output_path = raw_onnx_model

    if args.precision == Precision.INT8:
        logger.info("quantizing model...")
        QuantizeHelper.quantize_onnx_model(output_path, onnx_model_paths['int8'], args.use_external_data_format)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")
        output_path = onnx_model_paths['int8']

    if args.output.endswith('.onnx') and output_path != args.output and not args.use_external_data_format:
        import shutil
        shutil.move(output_path, args.output)
        output_path = args.output

    logger.info(f"Output path: {output_path}")

    session = create_onnxruntime_session(output_path, args.use_gpu, enable_all_optimization=True, verbose=args.verbose)
    if session is not None:
        gpt2helper.test_parity(session,
                               model,
                               device,
                               args.precision == Precision.FLOAT16,
                               rtol=args.tolerance,
                               atol=args.tolerance,
                               model_class=args.model_class,
                               has_position_ids=use_padding,
                               has_attention_mask=use_padding)

    if args.input_test_file:
        test_inputs = []
        # Each line of test file is a JSON string like:
        # {"input_ids": [[14698, 257, 1310, 13688, 319, 326]]}
        with open(args.input_test_file) as read_f:
            for _, line in enumerate(read_f):
                line = line.rstrip()
                data = json.loads(line)
                input_ids = torch.from_numpy(numpy.asarray(data["input_ids"], dtype=numpy.int64)).to(device)

                if use_padding:
                    if "attention_mask" in data:
                        numpy_float = numpy.float16 if args.precision == Precision.FLOAT16 else numpy.float32
                        attention_mask = torch.from_numpy(numpy.asarray(data["attention_mask"],
                                                                        dtype=numpy_float)).to(device)
                    else:
                        padding = -1
                        attention_mask = (
                            input_ids !=
                            padding).type(torch.float16 if args.precision == Precision.FLOAT16 else torch.float32)
                        input_ids.masked_fill_(input_ids == padding, 0)

                    if "position_ids" in data:
                        position_ids = torch.from_numpy(numpy.asarray(data["position_ids"],
                                                                      dtype=numpy.int64)).to(device)
                    else:
                        position_ids = (attention_mask.long().cumsum(-1) - 1)
                        position_ids.masked_fill_(position_ids < 0, 0)

                    inputs = {"input_ids": input_ids, "position_ids": position_ids, "attention_mask": attention_mask}
                else:
                    inputs = {"input_ids": input_ids}

                if model_type == "beam_search_step" or model_type == "configurable_one_step_search":
                    beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()

                    input_log_probs = torch.zeros([input_ids.shape[0], 1])
                    input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)
                    inputs.update({
                        "beam_select_idx": beam_select_idx,
                        "input_log_probs": input_log_probs,
                        "input_unfinished_sents": input_unfinished_sents,
                    })

                test_inputs.append(inputs)

        gpt2tester.test_generation(session,
                                   model,
                                   device,
                                   test_inputs,
                                   precision=args.precision,
                                   model_class=args.model_class,
                                   top_k=20,
                                   top_k_no_order=True,
                                   max_steps=24,
                                   max_inputs=0,
                                   verbose=args.verbose,
                                   save_test_data=3,
                                   save_test_data_dir=Path(output_path).parent)

    logger.info(f"Done. Output model: {output_path}")