def test_ort(args, device):
    model_name = args.model
    onnx_model_path = find_onnx_model(model_name) if not args.onnx else args.onnx
    optimized = onnx_model_path.endswith("_fp16.onnx") or onnx_model_path.endswith("_fp32.onnx")
    precision = 'fp32' if not onnx_model_path.endswith("_fp16.onnx") else 'fp16'
    model = load_torch_model(model_name, device)
    num_threads = args.num_threads

    session = benchmark_helper.create_onnxruntime_session(onnx_model_path,
                                                          use_gpu=True,
                                                          enable_all_optimization=True,
                                                          num_threads=num_threads)
    if session is None:
        raise RuntimeError(f"Failed to create ORT session from ONNX file {onnx_model_path}")

    description = onnx_model_path
    if os.environ.get('ORT_LONGFORMER_COMPACT_MEMORY', '0') == "1":
        description += "[compact_memory]"

    return test_ort_latency(device, model, model_name, description, session, args.batch_sizes,
                            args.sequence_lengths, args.global_lengths, args.test_times, num_threads,
                            optimized, precision, args.validate_onnx, args.disable_io_binding, args.verbose)
def run_profile(onnx_model_path, use_gpu, thread_num, batch_size, sequence_length, samples=1,
                input_ids_name=None, segment_ids_name=None, input_mask_name=None, dummy_inputs=None):
    from benchmark_helper import create_onnxruntime_session

    session = create_onnxruntime_session(onnx_model_path, use_gpu, num_threads=thread_num, enable_profiling=True)

    if dummy_inputs is None:
        all_inputs = create_inputs(onnx_model_path, batch_size, sequence_length, samples, input_ids_name,
                                   segment_ids_name, input_mask_name)
        for inputs in all_inputs:
            _ = session.run(None, inputs)
    else:
        for i in range(samples):
            _ = session.run(None, dummy_inputs)

    profile_file = session.end_profiling()
    return profile_file
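# For reference, a minimal sketch of the profiling flow that the enable_profiling flag above turns on,
# written against the public onnxruntime API. The benchmark_helper session factory may set additional
# options; this only illustrates profiling. onnx_path and feeds are hypothetical placeholders.
def _profile_once_example(onnx_path, feeds):
    import onnxruntime

    sess_options = onnxruntime.SessionOptions()
    sess_options.enable_profiling = True  # emit a JSON operator trace for this session
    session = onnxruntime.InferenceSession(onnx_path, sess_options, providers=["CPUExecutionProvider"])
    _ = session.run(None, feeds)
    return session.end_profiling()  # returns the path of the generated profile file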
def validate_onnx_model(onnx_model_path, example_inputs, example_outputs_flatten, use_gpu, fp16):
    test_session = create_onnxruntime_session(onnx_model_path, use_gpu, enable_all_optimization=False)
    if test_session is None:
        logger.error(f"{onnx_model_path} is an invalid ONNX model")
        return False

    logger.info(f"{onnx_model_path} is a valid ONNX model")

    # Compare the inference result with PyTorch or Tensorflow
    example_ort_inputs = {k: t.cpu().numpy() for k, t in example_inputs.items()}
    example_ort_outputs = test_session.run(None, example_ort_inputs)
    if len(example_outputs_flatten) != len(example_ort_outputs):
        logger.error(
            f"Number of output tensors expected {len(example_outputs_flatten)}, got {len(example_ort_outputs)}")
        return False

    for i in range(len(example_outputs_flatten)):
        abs_diff = numpy.amax(numpy.abs(example_ort_outputs[i] - example_outputs_flatten[i].cpu().numpy()))
        if abs_diff > 1e-4:
            logger.info(f"Max absolute diff={abs_diff} for output tensor {i}")

        rtol = 5e-02 if fp16 else 1e-4
        atol = 1e-01 if fp16 else 1e-4
        if not numpy.allclose(example_ort_outputs[i], example_outputs_flatten[i].cpu(), rtol=rtol, atol=atol):
            logger.error(f"Output tensor {i} is not close: rtol={rtol}, atol={atol}")
            return False

    logger.info(f"inference result of onnxruntime is validated on {onnx_model_path}")
    return True
def inference(model_path, dummy_inputs, outputs_path, use_gpu):
    environ_reset()
    environ_setting_nodes()
    environ_setting_paths(outputs_path)

    session = create_onnxruntime_session(model_path, use_gpu, enable_all_optimization=False)
    Gpt2Helper.onnxruntime_inference(session, dummy_inputs)
def test_ort(args, device) -> List[Dict[str, Any]]:
    model_name = args.model
    onnx_model_path = find_onnx_model(model_name) if not args.onnx else args.onnx
    optimized = onnx_model_path.endswith("_fp16.onnx") or onnx_model_path.endswith("_fp32.onnx")
    precision = "fp32" if not onnx_model_path.endswith("_fp16.onnx") else "fp16"
    model = load_torch_model(model_name, device)
    num_threads = args.num_threads

    cuda_provider_options = {"arena_extend_strategy": "kSameAsRequested"}
    provider_options = {"CUDAExecutionProvider": cuda_provider_options}
    session = benchmark_helper.create_onnxruntime_session(
        onnx_model_path,
        use_gpu=True,
        enable_all_optimization=True,
        num_threads=num_threads,
        provider_options=provider_options,
    )
    if session is None:
        raise RuntimeError(f"Failed to create ORT session from ONNX file {onnx_model_path}")

    use_compact_memory = os.environ.get("ORT_LONGFORMER_COMPACT_MEMORY", "1") == "1"
    description = onnx_model_path
    if not use_compact_memory:
        description += "[non_compact_memory]"
    if args.use_half4:
        description += "[half4]" if precision == "fp16" else "[float4]"
    else:
        description += "[half2]" if precision == "fp16" else "[float4]"

    return test_ort_latency(
        device,
        model,
        model_name,
        description,
        session,
        args.batch_sizes,
        args.sequence_lengths,
        args.global_lengths,
        args.test_times,
        num_threads,
        optimized,
        precision,
        args.disable_io_binding,
        args.verbose,
        use_compact_memory,
        args.use_half4,
        args.disable_parity,
    )
def inference():
    session = benchmark_helper.create_onnxruntime_session(onnx_model_path,
                                                          use_gpu=True,
                                                          enable_all_optimization=True,
                                                          num_threads=num_threads)
    dummy_inputs: LongformerInputs = LongformerHelper.get_dummy_inputs(batch_size, sequence_length,
                                                                       global_length, device)
    ort_inputs = dummy_inputs.get_ort_inputs()
    for _ in range(test_times):
        ort_outputs = session.run(None, ort_inputs)
def test_all(args):
    # Currently, the longformer attention operator can only run on GPU (no CPU implementation yet).
    device = torch.device('cuda:0')

    results = []
    for model_name in args.models:
        # Here we run an example input
        from transformers import LongformerModel
        torch_model_name_or_dir = MODELS[model_name]
        model = LongformerModel.from_pretrained(torch_model_name_or_dir)  # pretrained model name or directory
        model.to(device)

        # Search onnx model in the following order: optimized fp16 model, optimized fp32 model, raw model
        optimized = False
        precision = 'fp32'
        onnx_model_path = model_name + ".onnx"
        optimized_fp32_model = model_name + "_fp32.onnx"
        optimized_fp16_model = model_name + "_fp16.onnx"
        import os.path
        if os.path.isfile(optimized_fp16_model):
            onnx_model_path = optimized_fp16_model
            optimized = True
            precision = 'fp16'
        elif os.path.isfile(optimized_fp32_model):
            onnx_model_path = optimized_fp32_model
            optimized = True

        for num_threads in args.num_threads:
            if "torch" in args.engines:
                results += test_torch(device, model, model_name, args.batch_sizes, args.sequence_lengths,
                                      args.global_lengths, args.test_times, num_threads)

            if "onnxruntime" in args.engines:
                session = benchmark_helper.create_onnxruntime_session(onnx_model_path,
                                                                      use_gpu=True,
                                                                      enable_all_optimization=True,
                                                                      num_threads=num_threads)
                results += test_onnxruntime(device, model, model_name, session, args.batch_sizes,
                                            args.sequence_lengths, args.global_lengths, args.test_times,
                                            num_threads, optimized, precision)
    return results
def run_profile(onnx_model_path, use_gpu, basic_optimization, thread_num, batch_size, sequence_length, all_inputs):
    from benchmark_helper import create_onnxruntime_session

    session = create_onnxruntime_session(onnx_model_path,
                                         use_gpu,
                                         enable_all_optimization=not basic_optimization,
                                         num_threads=thread_num,
                                         enable_profiling=True)

    for inputs in all_inputs:
        _ = session.run(None, inputs)

    profile_file = session.end_profiling()
    return profile_file
def inference():
    # Update Arena strategy so that we can measure the minimum memory required
    cuda_provider_options = {"arena_extend_strategy": "kSameAsRequested"}
    provider_options = {"CUDAExecutionProvider": cuda_provider_options}
    session = benchmark_helper.create_onnxruntime_session(
        onnx_model_path,
        use_gpu=True,
        enable_all_optimization=True,
        num_threads=num_threads,
        provider_options=provider_options,
    )

    dummy_inputs: LongformerInputs = LongformerHelper.get_dummy_inputs(batch_size, sequence_length,
                                                                       global_length, device)
    ort_inputs = dummy_inputs.get_ort_inputs()
    for _ in range(test_times):
        _ = session.run(None, ort_inputs)
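# For context, a minimal sketch (not the benchmark_helper implementation) of how a per-provider option
# such as arena_extend_strategy is passed to onnxruntime directly. "kSameAsRequested" makes the CUDA
# memory arena grow only by the amount requested, which is why it is used above when measuring the
# minimum GPU memory needed. onnx_path and feeds are hypothetical placeholders.
def _session_with_compact_arena_example(onnx_path, feeds):
    import onnxruntime

    providers = [
        ("CUDAExecutionProvider", {"arena_extend_strategy": "kSameAsRequested"}),
        "CPUExecutionProvider",
    ]
    session = onnxruntime.InferenceSession(onnx_path, providers=providers)
    return session.run(None, feeds)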
def run_onnxruntime( use_gpu, provider, model_names, model_class, config_modifier, precision, num_threads, batch_sizes, sequence_lengths, repeat_times, input_counts, optimizer_info, validate_onnx, cache_dir, onnx_dir, verbose, overwrite, disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics, model_source, args, ): import onnxruntime results = [] if (use_gpu and ("CUDAExecutionProvider" not in onnxruntime.get_available_providers()) and ("ROCMExecutionProvider" not in onnxruntime.get_available_providers())): logger.error( "Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance." ) return results warm_up_repeat = 0 if provider == "tensorrt": optimizer_info = OptimizerInfo.NOOPT warm_up_repeat = 5 if "TensorrtExecutionProvider" not in onnxruntime.get_available_providers( ): logger.error( "Please install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance." ) return results if optimizer_info == OptimizerInfo.NOOPT: logger.warning( f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied." ) for model_name in model_names: all_input_names = MODELS[model_name][0] for num_inputs in input_counts: if num_inputs > len(all_input_names): break input_names = all_input_names[:num_inputs] args.model_type = MODELS[model_name][3] fusion_options = FusionOptions.parse(args) if "pt" in model_source: with torch.no_grad(): ( onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length, ) = export_onnx_model_from_pt( model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision, optimizer_info, validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics, fusion_options, ) if "tf" in model_source: ( onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length, ) = export_onnx_model_from_tf( model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision, optimizer_info, validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics, fusion_options, ) if not is_valid_onnx_model: continue ort_session = create_onnxruntime_session( onnx_model_file, use_gpu, provider, enable_all_optimization=True, num_threads=num_threads, verbose=verbose, ) if ort_session is None: continue ort_output_names = [ node_arg.name for node_arg in ort_session.get_outputs() ] output_buffers = [] device = "cuda" if use_gpu else "cpu" config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir) max_last_state_size = numpy.prod([ max(batch_sizes), max(sequence_lengths), max(vocab_size, config.hidden_size), ]) max_pooler_size = numpy.prod( [max(batch_sizes), config.hidden_size]) for batch_size in batch_sizes: if batch_size <= 0: continue for sequence_length in sequence_lengths: if max_sequence_length is not None and sequence_length > max_sequence_length: continue input_value_type = numpy.int64 if "pt" in model_source else numpy.int32 ort_inputs = create_onnxruntime_input( vocab_size, batch_size, sequence_length, input_names, config, input_value_type, ) result_template = { "engine": "onnxruntime", "version": onnxruntime.__version__, "providers": provider, "device": device, "optimizer": optimizer_info, "precision": precision, "io_binding": not disable_ort_io_binding, "model_name": model_name, "inputs": num_inputs, "threads": num_threads, 
"batch_size": batch_size, "sequence_length": sequence_length, "custom_layer_num": config_modifier.get_layer_num(), "datetime": str(datetime.now()), } logger.info( "Run onnxruntime on {} with input shape {}".format( model_name, [batch_size, sequence_length])) if disable_ort_io_binding: result = inference_ort( ort_session, ort_inputs, result_template, repeat_times, batch_size, warm_up_repeat, ) else: # Get output sizes from a dummy ort run ort_outputs = ort_session.run(ort_output_names, ort_inputs) output_buffer_max_sizes = [max_last_state_size] for i in range(len(ort_outputs)): if i == 2 and MODELS[model_name][3] == "gpt": # past state output max size output_buffer_max_sizes.append(max_pooler_size) else: output_buffer_max_sizes.append( max_last_state_size) data_type = numpy.longlong if "pt" in model_source else numpy.intc result = inference_ort_with_io_binding( ort_session, ort_inputs, result_template, repeat_times, ort_output_names, ort_outputs, output_buffers, output_buffer_max_sizes, batch_size, device, data_type, warm_up_repeat, ) logger.info(result) results.append(result) return results
def main(args): from transformers import __version__ as transformers_version if version.parse(transformers_version) < version.parse( "3.1.0"): # past_key_values name does not exist in 3.0.2 or older raise RuntimeError("This tool requires transformers 3.1.0 or later.") logger.info(f"Arguments:{args}") if args.precision == Precision.FLOAT16: assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu" if args.precision == Precision.INT8: assert not args.use_gpu, "quantization only supports CPU" torch.set_num_threads( psutil.cpu_count( logical=True) if args.thread_num <= 0 else args.thread_num) print(torch.__config__.parallel_info()) cache_dir = args.cache_dir output_dir = args.onnx_dir prepare_environment(cache_dir, output_dir, args.use_gpu) model_class = MODEL_CLASSES[args.model_class][0] if args.model_class == "GPT2LMHeadModel_BeamSearchStep": model_type = "beam_search_step" elif args.model_class == "GPT2LMHeadModel_ConfigurableOneStepSearch": model_type = "configurable_one_step_search" else: model_type = "default" gpt2helper = Gpt2HelperFactory.create_helper(model_type) config = AutoConfig.from_pretrained(args.model_name_or_path, torchscript=args.torchscript, cache_dir=cache_dir) if model_type == "beam_search_step": model = model_class.from_pretrained( args.model_name_or_path, config=config, batch_size=1, beam_size=args.beam_size, cache_dir=cache_dir, ) elif model_type == "configurable_one_step_search": model = model_class.from_pretrained( args.model_name_or_path, config=config, batch_size=1, beam_size=args.beam_size, ignore_eos=args.ignore_eos, temperature=args.temperature, repetition_penalty=args.repetition_penalty, excluded_token_ids=args.excluded_token_ids, length_penalty=args.length_penalty, do_sample=args.do_sample, do_sample_top_p=args.do_sample_top_p, do_sample_top_k=args.do_sample_top_k, cache_dir=cache_dir, ) else: model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir) # This scirpt does not support float16 for PyTorch. 
# if args.float16: # model.half() device = torch.device("cuda:0" if args.use_gpu else "cpu") model.to(device) use_external_data_format = config.n_layer > 24 # TODO: find a way to check model size > 2GB onnx_model_paths = gpt2helper.get_onnx_paths( output_dir, args.model_name_or_path, args.model_class, has_past=True, new_folder=use_external_data_format, ) onnx_model_path = onnx_model_paths["raw"] use_padding = MODEL_CLASSES[args.model_class][2] gpt2helper.export_onnx( model, device, onnx_model_path, args.verbose, use_external_data_format, has_position_ids=use_padding, has_attention_mask=use_padding, ) if args.optimize_onnx or args.precision != Precision.FLOAT32: onnx_model_path = onnx_model_paths[str( args.precision) if args.precision != Precision.INT8 else "fp32"] gpt2helper.optimize_onnx( onnx_model_paths["raw"], onnx_model_path, args.precision == Precision.FLOAT16, model.config.num_attention_heads, model.config.hidden_size, use_external_data_format, auto_mixed_precision=True, ) if args.precision == Precision.INT8: logger.info("quantizing model...") QuantizeHelper.quantize_onnx_model(onnx_model_path, onnx_model_paths["int8"], use_external_data_format) model = QuantizeHelper.quantize_torch_model(model) logger.info("finished quantizing model") onnx_model_path = onnx_model_paths["int8"] if args.torchscript: model = gpt2helper.torchscript( model, config, device, has_position_ids=use_padding, has_attention_mask=use_padding, ) session = create_onnxruntime_session( onnx_model_path, args.use_gpu, enable_all_optimization=False, num_threads=args.thread_num, verbose=args.verbose, ) if session is None: return # Allocate output buffers for IO Binding if model_type == "beam_search_step" or model_type == "configurable_one_step_search": max_output_shapes = gpt2helper.get_output_shapes( max(args.batch_sizes), context_len=max(args.past_sequence_lengths), past_sequence_length=max(args.past_sequence_lengths), sequence_length=max(args.sequence_lengths), beam_size=args.beam_size, step=0, config=config, model_class=args.model_class, ) output_buffers = gpt2helper.get_output_buffers( max_output_shapes, device, args.precision == Precision.FLOAT16) else: max_output_shapes = gpt2helper.get_output_shapes( max(args.batch_sizes), max(args.past_sequence_lengths), max(args.sequence_lengths), config, args.model_class, ) output_buffers = gpt2helper.get_output_buffers( max_output_shapes, device, args.precision == Precision.FLOAT16) csv_filename = args.result_csv or "benchmark_result_{}.csv".format( datetime.now().strftime("%Y%m%d-%H%M%S")) with open(csv_filename, mode="a", newline="") as csv_file: column_names = [ "model_name", "model_class", "gpu", "precision", "optimizer", "torchscript", "batch_size", "sequence_length", "past_sequence_length", "torch_latency", "onnxruntime_latency", "onnxruntime_io_binding_latency", ] csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) csv_writer.writeheader() for batch_size in args.batch_sizes: for sequence_length in args.sequence_lengths: for past_sequence_length in args.past_sequence_lengths: assert batch_size > 0 and sequence_length > 0 and past_sequence_length >= 0 logger.debug( f"Running test for batch_size={batch_size} sequence_length={sequence_length} past_sequence_length={past_sequence_length}..." 
) if model_type == "beam_search_step" or model_type == "configurable_one_step_search": dummy_inputs = gpt2helper.get_dummy_inputs( batch_size, past_sequence_length, sequence_length, config.num_attention_heads, config.hidden_size, config.n_layer, config.vocab_size, device, float16=(args.precision == Precision.FLOAT16), has_position_ids=use_padding, has_attention_mask=use_padding, ) output_shapes = gpt2helper.get_output_shapes( batch_size, past_sequence_length, past_sequence_length, sequence_length, args.beam_size, 0, config, args.model_class, ) else: dummy_inputs = gpt2helper.get_dummy_inputs( batch_size, past_sequence_length, sequence_length, config.num_attention_heads, config.hidden_size, config.n_layer, config.vocab_size, device, float16=(args.precision == Precision.FLOAT16), has_position_ids=use_padding, has_attention_mask=use_padding, ) output_shapes = gpt2helper.get_output_shapes( batch_size, past_sequence_length, sequence_length, config, args.model_class, ) try: outputs, torch_latency = gpt2helper.pytorch_inference( model, dummy_inputs, args.test_times) # Dump Torch output shape for i, value in enumerate(outputs): if isinstance(value, tuple): logger.debug( f"torch output {i} is tuple of size {len(value)}, shape {value[0].shape}" ) else: logger.debug( f"torch output {i} shape {value.shape}") ort_outputs, ort_latency = gpt2helper.onnxruntime_inference( session, dummy_inputs, args.test_times) ( ort_io_outputs, ort_io_latency, ) = gpt2helper.onnxruntime_inference_with_binded_io( session, dummy_inputs, output_buffers, output_shapes, args.test_times, return_numpy=False, include_copy_output_latency=args. include_copy_output_latency, ) if args.validate_onnx: if gpt2helper.compare_outputs( outputs, ort_outputs, model_class=args.model_class, rtol=DEFAULT_TOLERANCE[args.precision], atol=DEFAULT_TOLERANCE[args.precision], ): logger.info( f"Pytorch and ONNX Runtime outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]})." ) # Results of IO binding might be in GPU. Copy outputs to CPU for comparison. copy_outputs = [] for output in ort_io_outputs: copy_outputs.append(output.cpu().numpy()) if gpt2helper.compare_outputs( outputs, copy_outputs, model_class=args.model_class, rtol=DEFAULT_TOLERANCE[args.precision], atol=DEFAULT_TOLERANCE[args.precision], ): logger.info( f"Pytorch and ONNX Runtime IO Binding outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]})." ) logger.info( f"batch_size={batch_size}, sequence_length={sequence_length}, past_sequence_length={past_sequence_length}, torch_latency={torch_latency:.2f}, onnxruntime_latency={ort_latency:.2f}, onnxruntime_io_binding_latency={ort_io_latency:.2f}" ) row = { "model_name": args.model_name_or_path, "model_class": args.model_class, "gpu": args.use_gpu, "precision": args.precision, "optimizer": args.optimize_onnx, "torchscript": args.torchscript, "batch_size": batch_size, "sequence_length": sequence_length, "past_sequence_length": past_sequence_length, "torch_latency": f"{torch_latency:.2f}", "onnxruntime_latency": f"{ort_latency:.2f}", "onnxruntime_io_binding_latency": f"{ort_io_latency:.2f}", } csv_writer.writerow(row) except: logger.error(f"Exception", exc_info=True) return None logger.info(f"Results are saved to file {csv_filename}") return csv_filename
def main(): args = parse_arguments() setup_logger(args.verbose) if args.tolerance == 0: args.tolerance = DEFAULT_TOLERANCE[args.precision] logger.info(f"Arguments:{args}") cache_dir = args.cache_dir output_dir = args.output if not args.output.endswith( ".onnx") else os.path.dirname(args.output) prepare_environment(cache_dir, output_dir, args.use_gpu) if args.precision != Precision.FLOAT32: assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx" if args.precision == Precision.FLOAT16: assert args.use_gpu, "fp16 requires --use_gpu" if args.precision == Precision.INT8: assert not args.use_gpu, "quantization only supports CPU" model_class = MODEL_CLASSES[args.model_class][0] config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir) if hasattr(config, 'return_tuple'): config.return_tuple = True model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir) device = torch.device("cuda:0" if args.use_gpu else "cpu") model.eval().to(device) onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir, args.model_name_or_path, args.model_class) raw_onnx_model = args.output if args.output.endswith( '.onnx') else onnx_model_paths["raw"] output_path = raw_onnx_model if ( args.output.endswith('.onnx') or (args.precision == Precision.FLOAT32 and not args.optimize_onnx) ) else onnx_model_paths[str(args.precision)] Gpt2Helper.export_onnx(model, device, raw_onnx_model, args.verbose) if args.optimize_onnx or args.precision != Precision.FLOAT32: Gpt2Helper.optimize_onnx(raw_onnx_model, output_path, args.precision == Precision.FLOAT16, model.config.num_attention_heads, model.config.hidden_size) if args.precision == Precision.INT8: logger.info("quantizing model...") QuantizeHelper.quantize_onnx_model(output_path, output_path) model = QuantizeHelper.quantize_torch_model(model) logger.info("finished quantizing model") session = create_onnxruntime_session(output_path, args.use_gpu, enable_all_optimization=False, verbose=args.verbose) if session is not None: Gpt2Helper.test_parity(session, model, device, args.precision == Precision.FLOAT16, rtol=args.tolerance, atol=args.tolerance, model_class=args.model_class) logger.info(f"Done. Output model: {output_path}")
def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths, repeat_times, input_counts, optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose, overwrite, disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics, model_source): import onnxruntime results = [] if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()): logger.error( "Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance." ) return results if (not use_gpu) and ('CUDAExecutionProvider' in onnxruntime.get_available_providers()): logger.warning( "Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance." ) for model_name in model_names: all_input_names = MODELS[model_name][0] for num_inputs in input_counts: if num_inputs > len(all_input_names): break input_names = all_input_names[:num_inputs] if 'pt' in model_source: with torch.no_grad(): onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_pt( model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, cache_dir, onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics) if 'tf' in model_source: onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_tf( model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, cache_dir, onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics) if not is_valid_onnx_model: continue ort_session = create_onnxruntime_session( onnx_model_file, use_gpu, enable_all_optimization=True, num_threads=num_threads, verbose=verbose) if ort_session is None: continue ort_output_names = [ node_arg.name for node_arg in ort_session.get_outputs() ] output_buffers = {"last_state": None, "pooler": None} device = "cuda" if use_gpu else "cpu" config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir) max_last_state_size = numpy.prod([ max(batch_sizes), max(sequence_lengths), max(vocab_size, config.hidden_size) ]) max_pooler_size = numpy.prod( [max(batch_sizes), config.hidden_size]) for batch_size in batch_sizes: if batch_size <= 0: continue for sequence_length in sequence_lengths: if max_sequence_length is not None and sequence_length > max_sequence_length: continue input_value_type = numpy.int64 if 'pt' in model_source else numpy.int32 ort_inputs = create_onnxruntime_input( vocab_size, batch_size, sequence_length, input_names, input_value_type) result_template = { "engine": "onnxruntime", "version": onnxruntime.__version__, "device": device, "optimizer": optimize_onnx, "precision": precision, "io_binding": not disable_ort_io_binding, "model_name": model_name, "inputs": num_inputs, "threads": num_threads, "batch_size": batch_size, "sequence_length": sequence_length, "datetime": str(datetime.now()), } logger.info( "Run onnxruntime on {} with input shape {}".format( model_name, [batch_size, sequence_length])) if disable_ort_io_binding: result = inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_size) else: # Get output sizes from a dummy ort run ort_outputs = ort_session.run(ort_output_names, ort_inputs) data_type = numpy.longlong if 'pt' in model_source else numpy.int32 result = inference_ort_with_io_binding( ort_session, ort_inputs, result_template, 
repeat_times, ort_output_names, ort_outputs, output_buffers, max_last_state_size, max_pooler_size, batch_size, device, data_type) logger.info(result) results.append(result) return results
def main(): args = parse_arguments() setup_logger(args.verbose) if args.tolerance == 0: args.tolerance = DEFAULT_TOLERANCE[args.precision] logger.info(f"Arguments:{args}") cache_dir = args.cache_dir output_dir = args.output if not args.output.endswith( ".onnx") else os.path.dirname(args.output) prepare_environment(cache_dir, output_dir, args.use_gpu) if args.precision != Precision.FLOAT32: assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx" if args.precision == Precision.FLOAT16: assert args.use_gpu, "fp16 requires --use_gpu" if args.precision == Precision.INT8: assert not args.use_gpu, "quantization only supports CPU" model_class = MODEL_CLASSES[args.model_class][0] config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir) model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir) device = torch.device("cuda:0" if args.use_gpu else "cpu") model.eval().to(device) use_external_data_format = (config.n_layer > 24 ) #TODO: find a way to check model size > 2GB onnx_model_paths = Gpt2Helper.get_onnx_paths( output_dir, args.model_name_or_path, args.model_class, new_folder=use_external_data_format) raw_onnx_model = args.output if args.output.endswith( '.onnx') else onnx_model_paths["raw"] output_path = raw_onnx_model if ( args.output.endswith('.onnx') or (args.precision == Precision.FLOAT32 and not args.optimize_onnx) ) else onnx_model_paths[str(args.precision)] logger.info(f"Exporting ONNX model to {raw_onnx_model}") use_padding = MODEL_CLASSES[args.model_class][2] Gpt2Helper.export_onnx(model, device, raw_onnx_model, args.verbose, use_external_data_format, has_position_ids=use_padding, has_attention_mask=use_padding) if args.optimize_onnx or args.precision != Precision.FLOAT32: logger.info(f"Optimizing model to {output_path}") Gpt2Helper.optimize_onnx(raw_onnx_model, output_path, args.precision == Precision.FLOAT16, model.config.num_attention_heads, model.config.hidden_size) if args.precision == Precision.INT8: logger.info("quantizing model...") QuantizeHelper.quantize_onnx_model(output_path, output_path) model = QuantizeHelper.quantize_torch_model(model) logger.info("finished quantizing model") session = create_onnxruntime_session(output_path, args.use_gpu, enable_all_optimization=True, verbose=args.verbose) if session is not None: Gpt2Helper.test_parity(session, model, device, args.precision == Precision.FLOAT16, rtol=args.tolerance, atol=args.tolerance, model_class=args.model_class, has_position_ids=use_padding, has_attention_mask=use_padding) if args.input_test_file: test_inputs = [] # Each line of test file is a JSON string like: # {"input_ids": [[14698, 257, 1310, 13688, 319, 326]]} with open(args.input_test_file) as read_f: for i, line in enumerate(read_f): line = line.rstrip() data = json.loads(line) input_ids = torch.from_numpy( numpy.asarray(data["input_ids"], dtype=numpy.int64)).to(device) if use_padding: if "attention_mask" in data: numpy_float = numpy.float16 if args.precision == Precision.FLOAT16 else numpy.float32 attention_mask = torch.from_numpy( numpy.asarray(data["attention_mask"], dtype=numpy_float)).to(device) else: padding = -1 attention_mask = ( input_ids != padding).type(torch.float16 if args.precision == Precision.FLOAT16 else torch.float32) input_ids.masked_fill_(input_ids == padding, 0) if "position_ids" in data: position_ids = torch.from_numpy( numpy.asarray(data["position_ids"], dtype=numpy.int64)).to(device) else: position_ids = (attention_mask.long().cumsum(-1) - 1) 
position_ids.masked_fill_(position_ids < 0, 0) inputs = { "input_ids": input_ids, "position_ids": position_ids, "attention_mask": attention_mask } else: inputs = {"input_ids": input_ids} test_inputs.append(inputs) Gpt2Tester.test_generation(session, model, device, test_inputs, precision=args.precision, model_class=args.model_class, top_k=20, top_k_no_order=True, max_steps=24, max_inputs=0, verbose=args.verbose) logger.info(f"Done. Output model: {output_path}")
def main(): args = parse_arguments() setup_logger(args.verbose) logger.info(f"Arguments:{args}") if args.precision == Precision.FLOAT16: assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu" if args.precision == Precision.INT8: assert not args.use_gpu, "quantization only supports CPU" torch.set_num_threads( psutil.cpu_count( logical=True) if args.thread_num <= 0 else args.thread_num) print(torch.__config__.parallel_info()) cache_dir = args.cache_dir output_dir = args.onnx_dir prepare_environment(cache_dir, output_dir, args.use_gpu) model_class = MODEL_CLASSES[args.model_class][0] config = AutoConfig.from_pretrained(args.model_name, torchscript=args.torchscript, cache_dir=cache_dir) if hasattr(config, 'return_tuple'): config.return_tuple = True model = model_class.from_pretrained(args.model_name, config=config, cache_dir=cache_dir) # This scirpt does not support float16 for PyTorch. #if args.float16: # model.half() device = torch.device("cuda:0" if args.use_gpu else "cpu") model.to(device) onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir, args.model_name, args.model_class) onnx_model_path = onnx_model_paths["raw"] Gpt2Helper.export_onnx(model, device, onnx_model_path, args.verbose) if args.optimize_onnx or args.precision != Precision.FLOAT32: onnx_model_path = onnx_model_paths[str(args.precision)] Gpt2Helper.optimize_onnx(onnx_model_paths["raw"], onnx_model_path, args.precision == Precision.FLOAT16, model.config.num_attention_heads, model.config.hidden_size) if args.precision == Precision.INT8: logger.info("quantizing model...") QuantizeHelper.quantize_onnx_model(onnx_model_path, onnx_model_path) model = QuantizeHelper.quantize_torch_model(model) logger.info("finished quantizing model") if args.torchscript: model = Gpt2Helper.torchscript(model, config, device) session = create_onnxruntime_session(onnx_model_path, args.use_gpu, enable_all_optimization=False, num_threads=args.thread_num, verbose=args.verbose) if session is None: return # One word is generated for each inference. This length does not include that of past state. sequence_length = 1 # Allocate output buffers for IO Binding max_output_shapes = Gpt2Helper.get_output_shapes( max(args.batch_sizes), max(args.past_sequence_lengths), sequence_length, config, args.model_class) output_buffers = Gpt2Helper.get_output_buffers( max_output_shapes, device, args.precision == Precision.FLOAT16) csv_filename = args.result_csv or "benchmark_result_{}.csv".format( datetime.now().strftime("%Y%m%d-%H%M%S")) with open(csv_filename, mode="a", newline='') as csv_file: column_names = [ "model_name", "model_class", "gpu", "precision", "optimizer", "torchscript", "batch_size", "past_sequence_length", "torch_latency", "ort_latency", "ort_io_latency" ] csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) csv_writer.writeheader() for batch_size in args.batch_sizes: for past_sequence_length in args.past_sequence_lengths: logger.debug( f"Running test for batch_size={batch_size} past_sequence_length={past_sequence_length}..." 
) dummy_inputs = Gpt2Helper.get_dummy_inputs( batch_size, past_sequence_length, sequence_length, config.num_attention_heads, config.hidden_size, config.n_layer, config.vocab_size, device, args.precision == Precision.FLOAT16) output_shapes = Gpt2Helper.get_output_shapes( batch_size, past_sequence_length, sequence_length, config, args.model_class) try: outputs, torch_latency = Gpt2Helper.pytorch_inference( model, dummy_inputs, args.test_times) ort_outputs, ort_latency = Gpt2Helper.onnxruntime_inference( session, dummy_inputs, args.test_times) ort_io_outputs, ort_io_latency = Gpt2Helper.onnxruntime_inference_with_binded_io( session, dummy_inputs, output_buffers, output_shapes, args.test_times) if args.validate_onnx: if Gpt2Helper.compare_outputs( outputs, ort_outputs, rtol=DEFAULT_TOLERANCE[args.precision], atol=DEFAULT_TOLERANCE[args.precision]): logger.info( f'Pytorch and ONNX Runtime outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]}).' ) if Gpt2Helper.compare_outputs( outputs, ort_io_outputs, rtol=DEFAULT_TOLERANCE[args.precision], atol=DEFAULT_TOLERANCE[args.precision]): logger.info( f'Pytorch and ONNX Runtime IO Binding outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]}).' ) logger.info( f"batch_size={batch_size}, past_sequence_length={past_sequence_length}, torch_latency={torch_latency:.2f}, ort_latency={ort_latency:.2f}, ort_io_latency={ort_io_latency:.2f}" ) row = { "model_name": args.model_name, "model_class": args.model_class, "gpu": args.use_gpu, "precision": args.precision, "optimizer": args.optimize_onnx, "torchscript": args.torchscript, "batch_size": batch_size, "past_sequence_length": past_sequence_length, "torch_latency": f"{torch_latency:.2f}", "ort_latency": f"{ort_latency:.2f}", "ort_io_latency": f"{ort_io_latency:.2f}" } csv_writer.writerow(row) except: logger.error(f"Exception", exc_info=True) logger.info(f"Results are saved to file {csv_filename}")
def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_results.csv"): result = {} from transformers import __version__ as transformers_version if version.parse(transformers_version) < version.parse( "3.1.0"): # past_key_values name does not exist in 3.0.2 or older raise RuntimeError("This tool requires transformers 3.1.0 or later.") args = parse_arguments(argv) setup_logger(args.verbose) if not experiment_name: import sys experiment_name = " ".join(argv if argv else sys.argv[1:]) if args.tolerance == 0: args.tolerance = DEFAULT_TOLERANCE[args.precision] logger.info(f"Arguments:{args}") cache_dir = args.cache_dir output_dir = args.output if not args.output.endswith( ".onnx") else os.path.dirname(args.output) prepare_environment(cache_dir, output_dir, args.use_gpu) if args.precision != Precision.FLOAT32: assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx" if args.precision == Precision.FLOAT16: assert args.use_gpu, "fp16 requires --use_gpu" if args.precision == Precision.INT8: assert not args.use_gpu, "quantization only supports CPU" if args.use_external_data_format: assert not args.output.endswith( '.onnx' ), "output shall be a directory for --use_external_data_format" model_class = MODEL_CLASSES[args.model_class][0] use_padding = MODEL_CLASSES[args.model_class][2] if args.model_class == "GPT2LMHeadModel_BeamSearchStep": model_type = "beam_search_step" elif args.model_class == "GPT2LMHeadModel_ConfigurableOneStepSearch": model_type = "configurable_one_step_search" else: model_type = "default" gpt2helper = Gpt2HelperFactory.create_helper(model_type) gpt2tester = Gpt2TesterFactory.create_tester(model_type) config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir) if model_type == 'beam_search_step': model = model_class.from_pretrained(args.model_name_or_path, config=config, batch_size=1, beam_size=args.beam_size, cache_dir=cache_dir) elif model_type == 'configurable_one_step_search': model = model_class.from_pretrained( args.model_name_or_path, config=config, batch_size=1, beam_size=args.beam_size, ignore_eos=args.ignore_eos, temperature=args.temperature, repetition_penalty=args.repetition_penalty, excluded_token_ids=args.excluded_token_ids, length_penalty=args.length_penalty, do_sample=args.do_sample, do_sample_top_p=args.do_sample_top_p, do_sample_top_k=args.do_sample_top_k, cache_dir=cache_dir) else: model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir) device = torch.device("cuda:0" if args.use_gpu else "cpu") model.eval().to(device) if (not args.use_external_data_format) and (config.n_layer > 24): logger.info(f"Try --use_external_data_format when model size > 2GB") onnx_model_paths = gpt2helper.get_onnx_paths( output_dir, args.model_name_or_path, args.model_class, new_folder=args.use_external_data_format, remove_existing=[ "fp32", "fp16", "int8" ]) # Do not remove raw model to save time in parity test raw_onnx_model = onnx_model_paths["raw"] if os.path.exists(raw_onnx_model): logger.warning( f"Skip exporting ONNX model since it existed: {raw_onnx_model}") else: logger.info(f"Exporting ONNX model to {raw_onnx_model}") gpt2helper.export_onnx(model, device, raw_onnx_model, args.verbose, args.use_external_data_format, has_position_ids=use_padding, has_attention_mask=use_padding, input_ids_dtype=torch.int32 if args.use_int32_inputs else torch.int64, position_ids_dtype=torch.int32 if args.use_int32_inputs else torch.int64, attention_mask_dtype=torch.int32 if args.use_int32_inputs else 
torch.int64) fp16_params = {"keep_io_types": args.keep_io_types} if args.io_block_list: fp16_params["keep_io_types"] = args.io_block_list if args.node_block_list: fp16_params["node_block_list"] = args.node_block_list if args.op_block_list: fp16_params["op_block_list"] = args.op_block_list if args.force_fp16_initializers: fp16_params["force_fp16_initializers"] = args.force_fp16_initializers is_io_float16 = (args.precision == Precision.FLOAT16 and not args.keep_io_types) if args.optimize_onnx or args.precision != Precision.FLOAT32: output_path = onnx_model_paths[str(args.precision) if args. precision != Precision.INT8 else 'fp32'] logger.info(f"Optimizing model to {output_path}") gpt2helper.optimize_onnx( raw_onnx_model, output_path, args.precision == Precision.FLOAT16, model.config.num_attention_heads, model.config.hidden_size, args.use_external_data_format, auto_mixed_precision=args.auto_mixed_precision, **fp16_params) else: output_path = raw_onnx_model if args.precision == Precision.INT8: logger.info("quantizing model...") QuantizeHelper.quantize_onnx_model(output_path, onnx_model_paths['int8'], args.use_external_data_format) model = QuantizeHelper.quantize_torch_model(model) logger.info("finished quantizing model") output_path = onnx_model_paths['int8'] if args.output.endswith( '.onnx' ) and output_path != args.output and not args.use_external_data_format: import shutil shutil.move(output_path, args.output) output_path = args.output logger.info(f"Output path: {output_path}") model_size_in_MB = int( get_onnx_model_size(output_path, args.use_external_data_format) / 1024 / 1024) session = create_onnxruntime_session(output_path, args.use_gpu, enable_all_optimization=True, verbose=args.verbose) if args.model_class == "GPT2LMHeadModel" and session is not None: parity_result = gpt2helper.test_parity( session, model, device, is_io_float16, rtol=args.tolerance, atol=args.tolerance, model_class=args.model_class, has_position_ids=use_padding, has_attention_mask=use_padding, input_ids_dtype=torch.int32 if args.use_int32_inputs else torch.int64, position_ids_dtype=torch.int32 if args.use_int32_inputs else torch.int64, attention_mask_dtype=torch.int32 if args.use_int32_inputs else torch.int64, test_cases_per_run=args.test_cases, total_runs=args.test_runs, verbose=args.verbose) latency = gpt2helper.test_performance( session, model, device, is_io_float16, total_runs=100, use_io_binding=True, model_class=args.model_class, has_position_ids=use_padding, has_attention_mask=use_padding, input_ids_dtype=torch.int32 if args.use_int32_inputs else torch.int64, position_ids_dtype=torch.int32 if args.use_int32_inputs else torch.int64, attention_mask_dtype=torch.int32 if args.use_int32_inputs else torch.int64, batch_size=8, sequence_length=1, past_sequence_length=32) if args.precision == Precision.FLOAT16: logger.info(f"fp16 conversion parameters:{fp16_params}") # Write results to file import csv from onnxruntime import __version__ as ort_version latency_name = get_latency_name() csv_file_existed = os.path.exists(csv_filename) with open(csv_filename, mode="a", newline='') as csv_file: column_names = [ "experiment", "run_id", "model_name", "model_class", "gpu", "precision", "optimizer", "test_cases", "runs", "keep_io_types", "io_block_list", "op_block_list", "node_block_list", "force_fp16_initializers", "auto_mixed_precision", "ORT_TRANSFORMER_OPTIONS", "ORT_CUDA_GEMM_OPTIONS", "onnxruntime", latency_name, "top1_match_rate", "onnx_size_in_MB", "diff_50_percentile", "diff_90_percentile", "diff_95_percentile", 
"diff_99_percentile", "diff_pass_rate", "nan_rate", "top1_match_rate_per_run" ] csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) if not csv_file_existed: csv_writer.writeheader() row = { "experiment": experiment_name, "run_id": run_id, "model_name": args.model_name_or_path, "model_class": args.model_class, "gpu": args.use_gpu, "precision": args.precision, "optimizer": args.optimize_onnx, "test_cases": args.test_cases, "runs": args.test_runs, "keep_io_types": args.keep_io_types, "io_block_list": args.io_block_list, "op_block_list": args.op_block_list, "node_block_list": args.node_block_list, "force_fp16_initializers": args.force_fp16_initializers, "auto_mixed_precision": args.auto_mixed_precision, "ORT_TRANSFORMER_OPTIONS": os.getenv('ORT_TRANSFORMER_OPTIONS'), "ORT_CUDA_GEMM_OPTIONS": os.getenv('ORT_CUDA_GEMM_OPTIONS'), "onnxruntime": ort_version, latency_name: f"{latency:.2f}", "diff_50_percentile": parity_result["max_diff_percentile_50"], "diff_90_percentile": parity_result["max_diff_percentile_90"], "diff_95_percentile": parity_result["max_diff_percentile_95"], "diff_99_percentile": parity_result["max_diff_percentile_99"], "diff_pass_rate": parity_result["diff_pass_rate"], "nan_rate": parity_result["nan_rate"], "top1_match_rate": parity_result["top1_match_rate"], "top1_match_rate_per_run": parity_result["top1_match_rate_per_run"], "onnx_size_in_MB": "{}".format(model_size_in_MB), } logger.info(f"result: {row}") result.update(row) csv_writer.writerow(row) if args.input_test_file: test_inputs = [] # Each line of test file is a JSON string like: # {"input_ids": [[14698, 257, 1310, 13688, 319, 326]]} with open(args.input_test_file) as read_f: for _, line in enumerate(read_f): line = line.rstrip() data = json.loads(line) input_ids = torch.from_numpy( numpy.asarray(data["input_ids"], dtype=numpy.int64)).to(device) if use_padding: if "attention_mask" in data: numpy_float = numpy.float16 if is_io_float16 else numpy.float32 attention_mask = torch.from_numpy( numpy.asarray(data["attention_mask"], dtype=numpy_float)).to(device) else: padding = -1 attention_mask = (input_ids != padding).type( torch.float16 if is_io_float16 else torch.float32) input_ids.masked_fill_(input_ids == padding, 0) if "position_ids" in data: position_ids = torch.from_numpy( numpy.asarray(data["position_ids"], dtype=numpy.int64)).to(device) else: position_ids = (attention_mask.long().cumsum(-1) - 1) position_ids.masked_fill_(position_ids < 0, 0) inputs = { "input_ids": input_ids.to(torch.int32) if args.use_int32_inputs else input_ids, "position_ids": position_ids.to(torch.int32) if args.use_int32_inputs else position_ids, "attention_mask": attention_mask.to(torch.int32) if args.use_int32_inputs else attention_mask } else: inputs = { "input_ids": input_ids.to(torch.int32) if args.use_int32_inputs else input_ids } if model_type == "beam_search_step" or model_type == "configurable_one_step_search": beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long() input_log_probs = torch.zeros([input_ids.shape[0], 1]) input_unfinished_sents = torch.ones( [input_ids.shape[0], 1], dtype=torch.bool) inputs.update({ "beam_select_idx": beam_select_idx, "input_log_probs": input_log_probs, "input_unfinished_sents": input_unfinished_sents, }) test_inputs.append(inputs) gpt2tester.test_generation(session, model, device, test_inputs, precision=args.precision, model_class=args.model_class, top_k=20, top_k_no_order=True, max_steps=24, max_inputs=0, verbose=args.verbose, save_test_data=3, 
save_test_data_dir=Path(output_path).parent) logger.info(f"Done. Output model: {output_path}") return result
def test_all(args):
    # Currently, the longformer attention operator can only run on GPU (no CPU implementation yet).
    device = torch.device('cuda:0')

    results = []
    for model_name in args.models:
        # Here we run an example input
        from transformers import LongformerModel
        torch_model_name_or_dir = PRETRAINED_LONGFORMER_MODELS[model_name]
        model = LongformerModel.from_pretrained(torch_model_name_or_dir)  # pretrained model name or directory
        model.to(device)

        # Search onnx model in the following order: optimized fp16 model, optimized fp32 model, raw model
        # TODO: call convert_longformer_to_onnx to export onnx instead.
        import os.path
        optimized = False
        precision = 'fp32'
        onnx_model_path = os.path.join(args.onnx_dir, model_name + ".onnx")
        optimized_fp32_model = os.path.join(args.onnx_dir, model_name + "_fp32.onnx")
        optimized_fp16_model = os.path.join(args.onnx_dir, model_name + "_fp16.onnx")
        if os.path.isfile(optimized_fp16_model):
            onnx_model_path = optimized_fp16_model
            optimized = True
            precision = 'fp16'
        elif os.path.isfile(optimized_fp32_model):
            onnx_model_path = optimized_fp32_model
            optimized = True
        print("ONNX model path:", onnx_model_path)

        for num_threads in args.num_threads:
            if "torch" in args.engines:
                results += test_torch_latency(device, model, model_name, args.batch_sizes, args.sequence_lengths,
                                              args.global_lengths, args.test_times, num_threads, args.verbose)

            if "onnxruntime" in args.engines:
                if args.memory:
                    test_ort_memory(device, onnx_model_path, args.batch_sizes[0], args.sequence_lengths[0],
                                    args.global_lengths[0], args.test_times, num_threads)
                else:  # test latency
                    session = benchmark_helper.create_onnxruntime_session(onnx_model_path,
                                                                          use_gpu=True,
                                                                          enable_all_optimization=True,
                                                                          num_threads=num_threads)
                    if session is None:
                        raise RuntimeError(f"Failed to create ORT session from ONNX file {onnx_model_path}")

                    results += test_ort_latency(device, model, model_name, session, args.batch_sizes,
                                                args.sequence_lengths, args.global_lengths, args.test_times,
                                                num_threads, optimized, precision, args.validate_onnx,
                                                args.disable_io_binding, args.verbose)
    return results
def main(): args = parse_arguments() setup_logger(args.verbose) if args.tolerance == 0: args.tolerance = DEFAULT_TOLERANCE[args.precision] logger.info(f"Arguments:{args}") cache_dir = args.cache_dir output_dir = args.output if not args.output.endswith( ".onnx") else os.path.dirname(args.output) prepare_environment(cache_dir, output_dir, args.use_gpu) if args.precision != Precision.FLOAT32: assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx" if args.precision == Precision.FLOAT16: assert args.use_gpu, "fp16 requires --use_gpu" if args.precision == Precision.INT8: assert not args.use_gpu, "quantization only supports CPU" model_class = MODEL_CLASSES[args.model_class][0] config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir) if hasattr(config, 'return_tuple'): config.return_tuple = True model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir) device = torch.device("cuda:0" if args.use_gpu else "cpu") model.eval().to(device) onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir, args.model_name_or_path, args.model_class) raw_onnx_model = args.output if args.output.endswith( '.onnx') else onnx_model_paths["raw"] output_path = raw_onnx_model if ( args.output.endswith('.onnx') or (args.precision == Precision.FLOAT32 and not args.optimize_onnx) ) else onnx_model_paths[str(args.precision)] Gpt2Helper.export_onnx(model, device, raw_onnx_model, args.verbose) if args.optimize_onnx or args.precision != Precision.FLOAT32: Gpt2Helper.optimize_onnx(raw_onnx_model, output_path, args.precision == Precision.FLOAT16, model.config.num_attention_heads, model.config.hidden_size) if args.precision == Precision.INT8: logger.info("quantizing model...") QuantizeHelper.quantize_onnx_model(output_path, output_path) model = QuantizeHelper.quantize_torch_model(model) logger.info("finished quantizing model") session = create_onnxruntime_session(output_path, args.use_gpu, enable_all_optimization=False, verbose=args.verbose) if session is not None: Gpt2Helper.test_parity(session, model, device, args.precision == Precision.FLOAT16, rtol=args.tolerance, atol=args.tolerance, model_class=args.model_class) if args.input_test_file: test_inputs = [] with open(args.input_test_file) as read_f: for i, line in enumerate(read_f): line = line.rstrip() data = json.loads(line) input_ids = torch.from_numpy( numpy.asarray(data["input_ids"], dtype=numpy.int64)).to(device) position_ids = torch.from_numpy( numpy.asarray(data["position_ids"], dtype=numpy.int64)).to(device) numpy_float = numpy.float16 if args.precision == Precision.FLOAT16 else numpy.float32 attention_mask = torch.from_numpy( numpy.asarray(data["attention_mask"], dtype=numpy_float)).to(device) inputs = { "input_ids": input_ids, "position_ids": position_ids, "attention_mask": attention_mask } test_inputs.append(inputs) Gpt2Tester.test_generation(session, model, device, test_inputs, precision=args.precision, model_class=args.model_class, top_k=20, top_k_no_order=True, max_steps=24, max_inputs=0, verbose=args.verbose) logger.info(f"Done. Output model: {output_path}")
def export_onnx_models(model_name_or_path, cache_dir, output_dir, use_gpu, use_external_data_format,
                       optimize_onnx, precision, verbose,
                       use_decoder_start_token: bool = True,
                       merge_encoder_and_decoder_init: bool = True,
                       overwrite: bool = False):
    device = torch.device("cuda:0" if use_gpu else "cpu")

    models = T5Helper.load_model(model_name_or_path, cache_dir, device, merge_encoder_and_decoder_init)
    config = models["decoder"].config

    if (not use_external_data_format) and (config.num_layers > 24):
        logger.info("Try use_external_data_format when model size > 2GB")

    output_paths = []
    for name, model in models.items():
        filename_suffix = "_" + name
        onnx_path = T5Helper.get_onnx_path(output_dir,
                                           model_name_or_path,
                                           suffix=filename_suffix,
                                           new_folder=use_external_data_format)

        if overwrite or not os.path.exists(onnx_path):
            logger.info(f"Exporting ONNX model to {onnx_path}")
            # We have to clone model before exporting onnx, otherwise verify_onnx will report large difference.
            T5Helper.export_onnx(copy.deepcopy(model),
                                 device,
                                 onnx_path,
                                 verbose,
                                 use_external_data_format,
                                 use_decoder_input_ids=not use_decoder_start_token)
        else:
            logger.info(f"Skip exporting: existing ONNX model {onnx_path}")

        # Optimize ONNX graph. Note that we have not implemented graph optimization for T5 yet.
        if optimize_onnx or precision != Precision.FLOAT32:
            output_path = T5Helper.get_onnx_path(output_dir,
                                                 model_name_or_path,
                                                 suffix=filename_suffix + "_" + str(precision),
                                                 new_folder=use_external_data_format)
            if overwrite or not os.path.exists(output_path):
                logger.info(f"Optimizing model to {output_path}")
                T5Helper.optimize_onnx(onnx_path, output_path, precision == Precision.FLOAT16,
                                       config.num_heads, config.hidden_size, use_external_data_format)
            else:
                logger.info(f"Skip optimizing: existing ONNX model {onnx_path}")
        else:
            output_path = onnx_path

        ort_session = create_onnxruntime_session(
            output_path,
            use_gpu=use_gpu,
            provider=['CUDAExecutionProvider', 'CPUExecutionProvider'] if use_gpu else ['CPUExecutionProvider'])

        max_diff = T5Helper.verify_onnx(model, ort_session, device)
        logger.info(f'PyTorch and OnnxRuntime results max difference = {max_diff}')
        if max_diff > 1e-4:
            logger.warning('PyTorch and OnnxRuntime results are NOT close')

        output_paths.append(output_path)

    return output_paths
def main(args):
    logger.info(f"Arguments: {args}")

    if args.precision == Precision.FLOAT16:
        assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    torch.set_num_threads(psutil.cpu_count(logical=True) if args.thread_num <= 0 else args.thread_num)
    print(torch.__config__.parallel_info())

    cache_dir = args.cache_dir
    output_dir = args.onnx_dir
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    model_class = MODEL_CLASSES[args.model_class][0]
    config = AutoConfig.from_pretrained(args.model_name_or_path, torchscript=args.torchscript, cache_dir=cache_dir)
    model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)

    # This script does not support float16 for PyTorch.
    # if args.float16:
    #     model.half()

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.to(device)

    use_external_data_format = config.n_layer > 24  # TODO: find a way to check model size > 2GB
    onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir,
                                                 args.model_name_or_path,
                                                 args.model_class,
                                                 has_past=True,
                                                 new_folder=use_external_data_format)
    onnx_model_path = onnx_model_paths["raw"]

    use_padding = MODEL_CLASSES[args.model_class][2]
    Gpt2Helper.export_onnx(model,
                           device,
                           onnx_model_path,
                           args.verbose,
                           use_external_data_format,
                           has_position_ids=use_padding,
                           has_attention_mask=use_padding)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        onnx_model_path = onnx_model_paths[str(args.precision) if args.precision != Precision.INT8 else 'fp32']
        Gpt2Helper.optimize_onnx(onnx_model_paths["raw"], onnx_model_path, args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads, model.config.hidden_size, use_external_data_format)

        if args.precision == Precision.INT8:
            logger.info("quantizing model...")
            QuantizeHelper.quantize_onnx_model(onnx_model_path, onnx_model_paths["int8"], use_external_data_format)
            model = QuantizeHelper.quantize_torch_model(model)
            logger.info("finished quantizing model")
            onnx_model_path = onnx_model_paths["int8"]

    if args.torchscript:
        model = Gpt2Helper.torchscript(model,
                                       config,
                                       device,
                                       has_position_ids=use_padding,
                                       has_attention_mask=use_padding)

    session = create_onnxruntime_session(onnx_model_path,
                                         args.use_gpu,
                                         enable_all_optimization=False,
                                         num_threads=args.thread_num,
                                         verbose=args.verbose)
    if session is None:
        return

    # Allocate output buffers for IO Binding using the largest requested shapes.
    max_output_shapes = Gpt2Helper.get_output_shapes(max(args.batch_sizes), max(args.past_sequence_lengths),
                                                     max(args.sequence_lengths), config, args.model_class)
    output_buffers = Gpt2Helper.get_output_buffers(max_output_shapes, device, args.precision == Precision.FLOAT16)

    csv_filename = args.result_csv or "benchmark_result_{}.csv".format(datetime.now().strftime("%Y%m%d-%H%M%S"))
    with open(csv_filename, mode="a", newline='') as csv_file:
        column_names = [
            "model_name", "model_class", "gpu", "precision", "optimizer", "torchscript", "batch_size",
            "sequence_length", "past_sequence_length", "torch_latency", "onnxruntime_latency",
            "onnxruntime_io_binding_latency"
        ]
        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        csv_writer.writeheader()

        for batch_size in args.batch_sizes:
            for sequence_length in args.sequence_lengths:
                for past_sequence_length in args.past_sequence_lengths:
                    assert batch_size > 0 and sequence_length > 0 and past_sequence_length >= 0
                    logger.debug(
                        f"Running test for batch_size={batch_size} sequence_length={sequence_length} past_sequence_length={past_sequence_length}..."
                    )
                    dummy_inputs = Gpt2Helper.get_dummy_inputs(batch_size,
                                                               past_sequence_length,
                                                               sequence_length,
                                                               config.num_attention_heads,
                                                               config.hidden_size,
                                                               config.n_layer,
                                                               config.vocab_size,
                                                               device,
                                                               float16=(args.precision == Precision.FLOAT16),
                                                               has_position_ids=use_padding,
                                                               has_attention_mask=use_padding)
                    output_shapes = Gpt2Helper.get_output_shapes(batch_size, past_sequence_length, sequence_length,
                                                                 config, args.model_class)
                    try:
                        outputs, torch_latency = Gpt2Helper.pytorch_inference(model, dummy_inputs, args.test_times)
                        ort_outputs, ort_latency = Gpt2Helper.onnxruntime_inference(session, dummy_inputs,
                                                                                    args.test_times)
                        ort_io_outputs, ort_io_latency = Gpt2Helper.onnxruntime_inference_with_binded_io(
                            session,
                            dummy_inputs,
                            output_buffers,
                            output_shapes,
                            args.test_times,
                            return_numpy=False,
                            include_copy_output_latency=args.include_copy_output_latency)

                        if args.validate_onnx:
                            if Gpt2Helper.compare_outputs(outputs,
                                                          ort_outputs,
                                                          rtol=DEFAULT_TOLERANCE[args.precision],
                                                          atol=DEFAULT_TOLERANCE[args.precision]):
                                logger.info(
                                    f"PyTorch and ONNX Runtime outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]})."
                                )

                            # Results of IO binding might be on GPU. Copy outputs to CPU for comparison.
                            copy_outputs = []
                            for output in ort_io_outputs:
                                copy_outputs.append(output.cpu().numpy())

                            if Gpt2Helper.compare_outputs(outputs,
                                                          copy_outputs,
                                                          rtol=DEFAULT_TOLERANCE[args.precision],
                                                          atol=DEFAULT_TOLERANCE[args.precision]):
                                logger.info(
                                    f"PyTorch and ONNX Runtime IO Binding outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]})."
                                )

                        logger.info(
                            f"batch_size={batch_size}, sequence_length={sequence_length}, past_sequence_length={past_sequence_length}, "
                            f"torch_latency={torch_latency:.2f}, onnxruntime_latency={ort_latency:.2f}, "
                            f"onnxruntime_io_binding_latency={ort_io_latency:.2f}")

                        row = {
                            "model_name": args.model_name_or_path,
                            "model_class": args.model_class,
                            "gpu": args.use_gpu,
                            "precision": args.precision,
                            "optimizer": args.optimize_onnx,
                            "torchscript": args.torchscript,
                            "batch_size": batch_size,
                            "sequence_length": sequence_length,
                            "past_sequence_length": past_sequence_length,
                            "torch_latency": f"{torch_latency:.2f}",
                            "onnxruntime_latency": f"{ort_latency:.2f}",
                            "onnxruntime_io_binding_latency": f"{ort_io_latency:.2f}"
                        }
                        csv_writer.writerow(row)
                    except Exception:
                        logger.error("Exception", exc_info=True)

    logger.info(f"Results are saved to file {csv_filename}")
    return csv_filename
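A minimal sketch, and an assumption rather than part of the benchmark script: how one might read the CSV that main() writes and report the speedup of ONNX Runtime with IO binding over PyTorch, using the column names defined above.

import csv

def summarize_speedup(csv_filename):
    # Parse the benchmark CSV written by main() and print the IO-binding speedup per configuration.
    with open(csv_filename, newline="") as f:
        for row in csv.DictReader(f):
            torch_ms = float(row["torch_latency"])
            ort_io_ms = float(row["onnxruntime_io_binding_latency"])
            print(f"batch_size={row['batch_size']} sequence_length={row['sequence_length']} "
                  f"past_sequence_length={row['past_sequence_length']} "
                  f"speedup={torch_ms / ort_io_ms:.2f}x")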
def main():
    from transformers import __version__ as transformers_version
    if version.parse(transformers_version) < version.parse("3.1.0"):
        # The past_key_values name does not exist in 3.0.2 or older.
        raise RuntimeError("This tool requires transformers 3.1.0 or later.")

    args = parse_arguments()
    setup_logger(args.verbose)

    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments: {args}")

    cache_dir = args.cache_dir
    output_dir = args.output if not args.output.endswith(".onnx") else os.path.dirname(args.output)
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    if args.precision != Precision.FLOAT32:
        assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"

    if args.precision == Precision.FLOAT16:
        assert args.use_gpu, "fp16 requires --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    if args.use_external_data_format:
        assert not args.output.endswith('.onnx'), "output shall be a directory for --use_external_data_format"

    model_class = MODEL_CLASSES[args.model_class][0]
    if args.model_class == "GPT2LMHeadModel_BeamSearchStep":
        model_type = "beam_search_step"
    elif args.model_class == "GPT2LMHeadModel_ConfigurableOneStepSearch":
        model_type = "configurable_one_step_search"
    else:
        model_type = "default"

    gpt2helper = Gpt2HelperFactory.create_helper(model_type)
    gpt2tester = Gpt2TesterFactory.create_tester(model_type)

    config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir)
    if model_type == 'beam_search_step':
        model = model_class.from_pretrained(args.model_name_or_path,
                                            config=config,
                                            batch_size=1,
                                            beam_size=args.beam_size,
                                            cache_dir=cache_dir)
    elif model_type == 'configurable_one_step_search':
        model = model_class.from_pretrained(args.model_name_or_path,
                                            config=config,
                                            batch_size=1,
                                            beam_size=args.beam_size,
                                            ignore_eos=args.ignore_eos,
                                            temperature=args.temperature,
                                            repetition_penalty=args.repetition_penalty,
                                            excluded_token_ids=args.excluded_token_ids,
                                            length_penalty=args.length_penalty,
                                            do_sample=args.do_sample,
                                            do_sample_top_p=args.do_sample_top_p,
                                            do_sample_top_k=args.do_sample_top_k,
                                            cache_dir=cache_dir)
    else:
        model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    if (not args.use_external_data_format) and (config.n_layer > 24):
        logger.info("Try --use_external_data_format when model size > 2GB")

    onnx_model_paths = gpt2helper.get_onnx_paths(output_dir,
                                                 args.model_name_or_path,
                                                 args.model_class,
                                                 new_folder=args.use_external_data_format)

    raw_onnx_model = onnx_model_paths["raw"]
    logger.info(f"Exporting ONNX model to {raw_onnx_model}")

    use_padding = MODEL_CLASSES[args.model_class][2]
    gpt2helper.export_onnx(model,
                           device,
                           raw_onnx_model,
                           args.verbose,
                           args.use_external_data_format,
                           has_position_ids=use_padding,
                           has_attention_mask=use_padding)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        output_path = onnx_model_paths[str(args.precision) if args.precision != Precision.INT8 else 'fp32']
        logger.info(f"Optimizing model to {output_path}")
        gpt2helper.optimize_onnx(raw_onnx_model, output_path, args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads, model.config.hidden_size,
                                 args.use_external_data_format)
    else:
        output_path = raw_onnx_model

    if args.precision == Precision.INT8:
        logger.info("quantizing model...")
        QuantizeHelper.quantize_onnx_model(output_path, onnx_model_paths['int8'], args.use_external_data_format)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")
        output_path = onnx_model_paths['int8']

    if args.output.endswith('.onnx') and output_path != args.output and not args.use_external_data_format:
        import shutil
        shutil.move(output_path, args.output)
        output_path = args.output

    logger.info(f"Output path: {output_path}")

    session = create_onnxruntime_session(output_path, args.use_gpu, enable_all_optimization=True, verbose=args.verbose)
    if session is not None:
        gpt2helper.test_parity(session,
                               model,
                               device,
                               args.precision == Precision.FLOAT16,
                               rtol=args.tolerance,
                               atol=args.tolerance,
                               model_class=args.model_class,
                               has_position_ids=use_padding,
                               has_attention_mask=use_padding)

    if args.input_test_file:
        test_inputs = []
        # Each line of the test file is a JSON string like:
        # {"input_ids": [[14698, 257, 1310, 13688, 319, 326]]}
        with open(args.input_test_file) as read_f:
            for line in read_f:
                line = line.rstrip()
                data = json.loads(line)
                input_ids = torch.from_numpy(numpy.asarray(data["input_ids"], dtype=numpy.int64)).to(device)

                if use_padding:
                    if "attention_mask" in data:
                        numpy_float = numpy.float16 if args.precision == Precision.FLOAT16 else numpy.float32
                        attention_mask = torch.from_numpy(numpy.asarray(data["attention_mask"],
                                                                        dtype=numpy_float)).to(device)
                    else:
                        padding = -1
                        attention_mask = (input_ids != padding).type(
                            torch.float16 if args.precision == Precision.FLOAT16 else torch.float32)
                        input_ids.masked_fill_(input_ids == padding, 0)

                    if "position_ids" in data:
                        position_ids = torch.from_numpy(numpy.asarray(data["position_ids"],
                                                                      dtype=numpy.int64)).to(device)
                    else:
                        position_ids = attention_mask.long().cumsum(-1) - 1
                        position_ids.masked_fill_(position_ids < 0, 0)

                    inputs = {"input_ids": input_ids, "position_ids": position_ids, "attention_mask": attention_mask}
                else:
                    inputs = {"input_ids": input_ids}

                if model_type in ("beam_search_step", "configurable_one_step_search"):
                    beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()
                    input_log_probs = torch.zeros([input_ids.shape[0], 1])
                    input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)
                    inputs.update({
                        "beam_select_idx": beam_select_idx,
                        "input_log_probs": input_log_probs,
                        "input_unfinished_sents": input_unfinished_sents,
                    })

                test_inputs.append(inputs)

        gpt2tester.test_generation(session,
                                   model,
                                   device,
                                   test_inputs,
                                   precision=args.precision,
                                   model_class=args.model_class,
                                   top_k=20,
                                   top_k_no_order=True,
                                   max_steps=24,
                                   max_inputs=0,
                                   verbose=args.verbose,
                                   save_test_data=3,
                                   save_test_data_dir=Path(output_path).parent)

    logger.info(f"Done. Output model: {output_path}")
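A minimal sketch, and an assumption rather than part of the conversion script: writing a small --input_test_file in the JSONL format the loop above parses. Only "input_ids" is required; "attention_mask" and "position_ids" are optional, and the file name is a placeholder.

import json

# Illustrative file name; pass it via --input_test_file when running the script.
samples = [
    {"input_ids": [[14698, 257, 1310, 13688, 319, 326]]},
    {"input_ids": [[50256, 464, 2068]], "attention_mask": [[1, 1, 1]], "position_ids": [[0, 1, 2]]},
]
with open("gpt2_test_inputs.jsonl", "w") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")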