def create_model_inference_app(
    model_path: str, batch_size: int, num_cores: int, address: str, port: str
) -> flask.Flask:
    print(f"Compiling model at {model_path}")
    engine = compile_model(model_path, batch_size, num_cores)
    print(engine)

    app = flask.Flask(__name__)
    CORS(app)

    @app.route("/predict", methods=["POST"])
    def predict():
        data = flask.request.get_data()

        inputs = bytes_to_tensors(data)
        print(f"Received {len(inputs)} inputs from client")

        print("Executing model")
        outputs, elapsed_time = engine.timed_run(inputs)

        print(f"Inference time took {elapsed_time * 1000.0:.4f} milliseconds")
        print(f"Produced {len(outputs)} output tensors")
        return tensors_to_bytes(outputs)

    @app.route("/info", methods=["GET"])
    def info():
        return flask.jsonify({"model_path": model_path, "engine": repr(engine)})

    print("Starting Flask app")
    app.run(host=address, port=port, debug=False, threaded=True)
def main():
    args = parse_args()
    model = fetch_model(args.model_name)
    batch_size = args.batch_size
    num_cores = args.num_cores

    # Gather batch of data
    batch = model.sample_batch(batch_size=batch_size)
    batched_inputs = batch["inputs"]
    batched_outputs = batch["outputs"]

    # Compile model for inference
    print("Compiling {} model with DeepSparse Engine".format(model.architecture_id))
    engine = compile_model(model, batch_size, num_cores)
    print(engine)

    # INFERENCE
    # Record output from inference through the DeepSparse Engine
    print("Executing...")
    predicted_outputs = engine(batched_inputs)

    # Compare against reference model output
    verify_outputs(predicted_outputs, batched_outputs)

    # BENCHMARK
    # Record output from executing through the DeepSparse engine
    print("Benchmarking...")
    results = engine.benchmark(batched_inputs)
    print(results)
def main():
    args = parse_args()
    onnx_filepath = args.onnx_filepath
    batch_size = args.batch_size

    inputs = generate_random_inputs(onnx_filepath, batch_size)
    input_names = get_input_names(onnx_filepath)
    output_names = get_output_names(onnx_filepath)
    inputs_dict = {name: value for name, value in zip(input_names, inputs)}

    # ONNXRuntime inference
    print("Executing model with ONNXRuntime...")
    sess_options = onnxruntime.SessionOptions()
    with override_onnx_batch_size(onnx_filepath, batch_size) as override_onnx_filepath:
        ort_network = onnxruntime.InferenceSession(
            override_onnx_filepath, sess_options
        )
        ort_outputs = ort_network.run(output_names, inputs_dict)

    # DeepSparse Engine inference
    print("Executing model with DeepSparse Engine...")
    dse_network = compile_model(onnx_filepath, batch_size=batch_size)
    dse_outputs = dse_network(inputs)

    verify_outputs(dse_outputs, ort_outputs)
    print("DeepSparse Engine output matches ONNXRuntime output")
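# Hypothetical sketch of an output-verification helper in the spirit of the
# verify_outputs call above (the real helper lives in the example utilities and may
# differ): each predicted array is checked against its reference within a tolerance.
import numpy
from typing import List


def verify_outputs_sketch(
    outputs: List[numpy.ndarray],
    gt_outputs: List[numpy.ndarray],
    atol: float = 8.0e-4,
    rtol: float = 0.0,
) -> None:
    assert len(outputs) == len(gt_outputs), "output count mismatch"
    for predicted, reference in zip(outputs, gt_outputs):
        assert predicted.shape == reference.shape, "output shape mismatch"
        assert numpy.allclose(
            predicted, reference, atol=atol, rtol=rtol
        ), "output values differ beyond tolerance"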
def test_engine(self, model: Model, batch_size: int):
    """
    Test the Engine.inference interfaces
    """
    m = model()
    batch = m.sample_batch(batch_size=batch_size)
    inputs = batch["inputs"]
    outputs = batch["outputs"]

    print("compile model")
    engine = compile_model(m, batch_size)

    print("engine callable")
    pred_outputs = engine(inputs)
    verify_outputs(pred_outputs, outputs)

    print("engine run")
    pred_outputs = engine.run(inputs)
    verify_outputs(pred_outputs, outputs)

    print("engine mapped_run")
    pred_outputs = engine.mapped_run(inputs)
    assert len(pred_outputs) == len(outputs)

    print("engine timed_run")
    pred_outputs, elapsed = engine.timed_run(inputs)
    verify_outputs(pred_outputs, outputs)
def engine_flask_server(
    model_path: str,
    batch_size: int = 1,
    num_cores: int = None,
    scheduler: Scheduler = Scheduler.multi_stream,
    address: str = "0.0.0.0",
    port: str = "5543",
) -> flask.Flask:
    """
    :param model_path: Either a path to the model's onnx file, a SparseZoo model stub
        prefixed by 'zoo:', a SparseZoo Model object, or a SparseZoo ONNX File object
        that defines the neural network
    :param batch_size: The batch size of the inputs to be used with the model
    :param num_cores: The number of physical cores to run the model on. Pass None or 0
        to run on the max number of cores in one socket for the current machine,
        default None
    :param scheduler: The kind of scheduler to execute with. Defaults to multi_stream
    :param address: IP address to run on. Default is 0.0.0.0
    :param port: port to run on. Default is 5543
    :return: launches a flask server on the given address and port that can run the
        given model on the DeepSparse engine via HTTP requests
    """
    _LOGGER.info(f"Compiling model at {model_path}")
    engine = compile_model(model_path, batch_size, num_cores, scheduler=scheduler)
    _LOGGER.info(engine)

    app = flask.Flask(__name__)
    CORS(app)

    @app.route("/run", methods=["POST"])
    def run():
        data = flask.request.get_data()

        inputs = bytes_to_arrays(data)
        _LOGGER.info(f"Received {len(inputs)} inputs from client")

        _LOGGER.info("Executing model")
        outputs, elapsed_time = engine.timed_run(inputs)

        _LOGGER.info(f"Inference time took {elapsed_time * 1000.0:.4f} milliseconds")
        _LOGGER.info(f"Produced {len(outputs)} output tensors")
        return arrays_to_bytes(outputs)

    @app.route("/info", methods=["GET"])
    def info():
        return flask.jsonify({"model_path": model_path, "engine": repr(engine)})

    _LOGGER.info("Starting Flask app")
    app.run(host=address, port=port, debug=False, threaded=True)
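# Minimal client sketch for the "/run" endpoint above (an illustration, not part of
# the original example). It assumes the same arrays_to_bytes / bytes_to_arrays
# helpers used by the server are importable on the client side (import path is an
# assumption), and that the input shape below matches the compiled model.
import numpy
import requests

from utils_flask import arrays_to_bytes, bytes_to_arrays  # assumed helper module


def run_client_once(address: str = "0.0.0.0", port: str = "5543"):
    url = f"http://{address}:{port}/run"
    # Example input: one random tensor; replace with real pre-processed data
    inputs = [numpy.random.rand(1, 3, 224, 224).astype(numpy.float32)]
    response = requests.post(url, data=arrays_to_bytes(inputs))
    outputs = bytes_to_arrays(response.content)
    print(f"Received {len(outputs)} output tensors from server")
    return outputs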
def create_and_run_model_server(
    args, model_path: str, batch_size: int, num_cores: int, address: str, port: str
) -> flask.Flask:
    print(f"Compiling model at {model_path}")
    engine = compile_model(model_path, batch_size, num_cores)
    print(engine)

    postprocessor = YoloPostprocessor()

    app = flask.Flask(__name__)
    CORS(app)

    @app.route("/predict", methods=["POST"])
    def predict():
        # load raw images
        raw_data = flask.request.get_data()
        inputs = bytes_to_arrays(raw_data)
        print(f"Received {len(inputs)} images from client")

        # pre-processing
        preprocess_start_time = time.time()
        if not args.quantized_inputs:
            inputs = [inputs[0].astype(numpy.float32) / 255.0]
        preprocess_time = time.time() - preprocess_start_time
        print(f"Pre-processing time: {preprocess_time * 1000.0:.4f}ms")

        # inference
        print("Executing model")
        outputs, elapsed_time = engine.timed_run(inputs)
        print(f"Inference time: {elapsed_time * 1000.0:.4f}ms")

        # post-processing
        postprocess_start_time = time.time()
        outputs = postprocessor.pre_nms_postprocess(outputs)
        postprocess_time = time.time() - postprocess_start_time
        print(f"Post-processing, pre-nms time: {postprocess_time * 1000.0:.4f}ms")

        # NMS
        nms_start_time = time.time()
        outputs = postprocess_nms(outputs)
        nms_time = time.time() - nms_start_time
        print(f"nms time: {nms_time * 1000.0:.4f}ms")

        return arrays_to_bytes(outputs)

    @app.route("/info", methods=["GET"])
    def info():
        return flask.jsonify({"model_path": model_path, "engine": repr(engine)})

    print("Starting Flask app")
    app.run(host=address, port=port, debug=False, threaded=True)
def __init__(
    self,
    model: Union[str, ModelProto],
    batch_size: int,
    num_cores: int = None,
    loss: Union[
        Callable[[Dict[str, numpy.ndarray], Dict[str, numpy.ndarray]], Any], None
    ] = None,
):
    super().__init__(model, batch_size, num_cores, loss)
    self._engine = compile_model(
        self._model, batch_size=batch_size, num_cores=num_cores
    )
    _LOGGER.debug("created model in neural magic {}".format(self._engine))
def _load_model(args) -> Tuple[Any, List[str]]:
    if args.engine == ORT_ENGINE and ort_error is not None:
        raise ort_error

    # validation
    if (
        args.num_cores is not None
        and args.engine == ORT_ENGINE
        and onnxruntime.__version__ < "1.7"
    ):
        raise ValueError(
            "overriding default num_cores not supported for onnxruntime < 1.7.0. "
            "If using an older build with OpenMP, try setting the OMP_NUM_THREADS "
            "environment variable"
        )

    # load model from sparsezoo if necessary
    if args.model_filepath.startswith("zoo:"):
        zoo_model = Zoo.load_model_from_stub(args.model_filepath)
        downloaded_path = zoo_model.onnx_file.downloaded_path()
        print(f"downloaded sparsezoo model {args.model_filepath} to {downloaded_path}")
        args.model_filepath = downloaded_path

    # overwrite static ONNX graph inputs to the desired batch size and sequence length
    input_names = []
    if args.engine in [DEEPSPARSE_ENGINE, ORT_ENGINE]:
        args.model_filepath, input_names, _ = overwrite_transformer_onnx_model_inputs(
            args.model_filepath,
            batch_size=args.batch_size,
            max_length=args.max_sequence_length,
        )

    # load model
    if args.engine == DEEPSPARSE_ENGINE:
        print(f"Compiling deepsparse model for {args.model_filepath}")
        model = compile_model(args.model_filepath, args.batch_size, args.num_cores)
        print(f"Engine info: {model}")
    elif args.engine == ORT_ENGINE:
        print(f"loading onnxruntime model for {args.model_filepath}")
        sess_options = onnxruntime.SessionOptions()
        if args.num_cores is not None:
            sess_options.intra_op_num_threads = args.num_cores
        sess_options.log_severity_level = 3
        sess_options.graph_optimization_level = (
            onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        )
        model = onnxruntime.InferenceSession(
            args.model_filepath, sess_options=sess_options
        )

    return model, input_names
def test_benchmark(self, model: Model, batch_size: int):
    """
    Test the Engine.benchmark() interface
    """
    m = model()
    batch = m.sample_batch(batch_size=batch_size)
    inputs = batch["inputs"]
    outputs = batch["outputs"]

    engine = compile_model(m, batch_size)
    results = engine.benchmark(inputs, include_outputs=True)

    for output in results.outputs:
        verify_outputs(output, outputs)
def main():
    args = parse_args()
    onnx_filepath = args.onnx_filepath
    batch_size = args.batch_size
    num_cores = args.num_cores
    num_iterations = args.num_iterations
    num_warmup_iterations = args.num_warmup_iterations

    inputs = generate_random_inputs(onnx_filepath, batch_size)
    input_names = get_input_names(onnx_filepath)
    output_names = get_output_names(onnx_filepath)
    inputs_dict = {name: value for name, value in zip(input_names, inputs)}

    # Benchmark ONNXRuntime
    print("Benchmarking model with ONNXRuntime...")
    sess_options = onnxruntime.SessionOptions()
    sess_options.intra_op_num_threads = num_cores
    with override_onnx_batch_size(onnx_filepath, batch_size) as override_onnx_filepath:
        ort_network = onnxruntime.InferenceSession(
            override_onnx_filepath, sess_options
        )

        ort_results = BenchmarkResults()
        for i in range(num_warmup_iterations):
            ort_network.run(output_names, inputs_dict)
        for i in range(num_iterations):
            start = time.time()
            output = ort_network.run(output_names, inputs_dict)
            end = time.time()
            ort_results.append_batch(
                time_start=start, time_end=end, batch_size=batch_size, outputs=output
            )

    # Benchmark DeepSparse Engine
    print("Benchmarking model with DeepSparse Engine...")
    dse_network = compile_model(onnx_filepath, batch_size, num_cores)
    dse_results = dse_network.benchmark(
        inputs, num_iterations, num_warmup_iterations, include_outputs=True
    )

    for dse_output, ort_output in zip(dse_results.outputs, ort_results.outputs):
        verify_outputs(dse_output, ort_output)

    print("ONNXRuntime", ort_results)
    print()
    print("DeepSparse Engine", dse_results)
def main():
    """
    process arguments and run server
    """
    _config = parse_args()

    # Get model
    print(f"Compiling model at {_config.onnx_filepath}")
    engine = compile_model(
        _config.onnx_filepath, _config.batch_size, _config.num_cores
    )
    print(engine)

    # Serve model
    run_server(
        predictor=engine,
        host=_config.address,
        port=_config.port,
        info=_config.onnx_filepath,
    )
def _load_model(
    model_filepath: str,
    batch_size: int,
    num_cores: Optional[int],
    engine: str,
    image_shape: Tuple[int, int],
):
    # load and return respective classification model according to arguments
    if (
        num_cores is not None
        and engine == ORT_ENGINE
        and onnxruntime.__version__ < "1.7"
    ):
        raise ValueError(
            "overriding default num_cores not supported for onnxruntime < 1.7.0. "
            "If using an older build with OpenMP, try setting the OMP_NUM_THREADS "
            "environment variable"
        )

    # scale static ONNX graph to desired image shape
    if engine in [DEEPSPARSE_ENGINE, ORT_ENGINE]:
        model_filepath, _, image_shape = fix_onnx_input_shape(
            model_filepath, image_shape
        )

    # load model
    if engine == DEEPSPARSE_ENGINE:
        print(f"Compiling deepsparse model for {model_filepath}")
        model = compile_model(model_filepath, batch_size, num_cores)
        print(f"Engine info: {model}")
    elif engine == ORT_ENGINE:
        print(f"loading onnxruntime model for {model_filepath}")
        sess_options = onnxruntime.SessionOptions()
        if num_cores is not None:
            sess_options.intra_op_num_threads = num_cores
        sess_options.log_severity_level = 3
        sess_options.graph_optimization_level = (
            onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        )
        onnx_model = onnx.load(model_filepath)
        override_model_batch_size(onnx_model, batch_size)
        model = onnxruntime.InferenceSession(
            onnx_model.SerializeToString(), sess_options=sess_options
        )

    return model, image_shape
def main():
    args = parse_args()
    model = fetch_model(args.model_name)
    batch_size = args.batch_size
    num_cores = args.num_cores

    # Gather batch of data
    batch = model.sample_batch(batch_size=batch_size)
    batched_inputs = batch["inputs"]
    batched_outputs = batch["outputs"]

    # Compile model for inference
    print("Compiling {} model with DeepSparse Engine".format(model.architecture_id))
    engine = compile_model(model, batch_size, num_cores)
    print(engine)

    # INFERENCE
    # Record output from inference through the DeepSparse Engine
    print("Executing...")
    predicted_outputs = engine(batched_inputs)

    # Compare against reference model output
    verify_outputs(predicted_outputs, batched_outputs)

    if "labels" in batch:
        batched_labels = batch["labels"]
        # Measure accuracy against ground truth labels
        accuracy = calculate_top1_accuracy(predicted_outputs[-1], batched_labels[0])
        print(
            "Top-1 Accuracy for batch size {}: {:.2f}%".format(batch_size, accuracy)
        )

    # BENCHMARK
    # Record output from executing through the DeepSparse engine
    print("Benchmarking...")
    results = engine.benchmark(batched_inputs)
    print(results)
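# Hypothetical sketch of a top-1 accuracy helper in the spirit of the
# calculate_top1_accuracy call above (the real helper lives in the example code and
# may differ): compare argmax of the class scores against the ground-truth labels.
import numpy


def calculate_top1_accuracy_sketch(
    predictions: numpy.ndarray, labels: numpy.ndarray
) -> float:
    # predictions: (batch_size, num_classes) scores
    # labels: (batch_size,) class ids or (batch_size, num_classes) one-hot vectors
    if labels.ndim > 1:
        labels = numpy.argmax(labels, axis=-1)
    predicted_classes = numpy.argmax(predictions, axis=-1)
    return float(numpy.mean(predicted_classes == labels)) * 100.0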
def main():
    args = parse_args()

    if args.quiet:
        set_logging_level(logging.WARN)

    decide_thread_pinning(args.thread_pinning)

    scenario = parse_scenario(args.scenario.lower())
    scheduler = parse_scheduler(scenario)
    input_shapes = parse_input_shapes(args.input_shapes)

    orig_model_path = args.model_path
    args.model_path = model_to_path(args.model_path)

    # Compile the ONNX into a runnable model
    if args.engine == DEEPSPARSE_ENGINE:
        model = compile_model(
            model=args.model_path,
            batch_size=args.batch_size,
            num_cores=args.num_cores,
            scheduler=scheduler,
            input_shapes=input_shapes,
        )
    elif args.engine == ORT_ENGINE:
        model = ORTEngine(
            model=args.model_path,
            batch_size=args.batch_size,
            num_cores=args.num_cores,
            input_shapes=input_shapes,
        )
    _LOGGER.info(model)

    # Generate random inputs to feed the model
    # TODO(mgoin): should be able to query Engine class instead of loading ONNX
    if input_shapes:
        with override_onnx_input_shapes(args.model_path, input_shapes) as model_path:
            input_list = generate_random_inputs(model_path, args.batch_size)
    else:
        input_list = generate_random_inputs(args.model_path, args.batch_size)

    if args.num_streams:
        _LOGGER.info("num_streams set to {}".format(args.num_streams))
    elif not args.num_streams and scenario not in "singlestream":
        # If num_streams isn't defined, find a default
        args.num_streams = max(1, int(model.num_cores / 2))
        _LOGGER.info(
            "num_streams default value chosen of {}. "
            "This requires tuning and may be sub-optimal".format(args.num_streams)
        )

    # Benchmark
    _LOGGER.info(
        "Starting '{}' performance measurements for {} seconds".format(
            args.scenario, args.time
        )
    )
    benchmark_result = model_stream_benchmark(
        model,
        input_list,
        scenario=scenario,
        seconds_to_run=args.time,
        seconds_to_warmup=args.warmup_time,
        num_streams=args.num_streams,
    )

    # Results summary
    print("Original Model Path: {}".format(orig_model_path))
    print("Batch Size: {}".format(args.batch_size))
    print("Scenario: {}".format(scenario))
    print("Throughput (items/sec): {:.4f}".format(benchmark_result["items_per_sec"]))
    print("Latency Mean (ms/batch): {:.4f}".format(benchmark_result["mean"]))
    print("Latency Median (ms/batch): {:.4f}".format(benchmark_result["median"]))
    print("Latency Std (ms/batch): {:.4f}".format(benchmark_result["std"]))
    print("Iterations: {}".format(int(benchmark_result["iterations"])))

    if args.export_path:
        # Export results
        print("Saving benchmark results to JSON file at {}".format(args.export_path))
        export_dict = {
            "engine": str(model),
            "orig_model_path": orig_model_path,
            "model_path": args.model_path,
            "batch_size": args.batch_size,
            "input_shapes": args.input_shapes,
            "num_cores": args.num_cores,
            "scenario": args.scenario,
            "scheduler": str(model.scheduler),
            "seconds_to_run": args.time,
            "num_streams": args.num_streams,
            "benchmark_result": benchmark_result,
        }
        with open(args.export_path, "w") as out:
            json.dump(export_dict, out, indent=2)
def _load_model(args) -> Any:
    # validation
    if args.device not in [None, "cpu"] and args.engine != TORCH_ENGINE:
        raise ValueError(f"device {args.device} is not supported for {args.engine}")
    if args.fp16 and args.engine != TORCH_ENGINE:
        raise ValueError(f"half precision is not supported for {args.engine}")
    if args.quantized_inputs and args.engine == TORCH_ENGINE:
        raise ValueError(f"quantized inputs not supported for {args.engine}")
    if args.num_cores is not None and args.engine == TORCH_ENGINE:
        raise ValueError(
            f"overriding default num_cores not supported for {args.engine}"
        )
    if (
        args.num_cores is not None
        and args.engine == ORT_ENGINE
        and onnxruntime.__version__ < "1.7"
    ):
        raise ValueError(
            "overriding default num_cores not supported for onnxruntime < 1.7.0. "
            "If using an older build with OpenMP, try setting the OMP_NUM_THREADS "
            "environment variable"
        )
    if args.num_sockets is not None and args.engine != DEEPSPARSE_ENGINE:
        raise ValueError(f"Overriding num_sockets is not supported for {args.engine}")

    # scale static ONNX graph to desired image shape
    if args.engine in [DEEPSPARSE_ENGINE, ORT_ENGINE]:
        args.model_filepath, _ = modify_yolo_onnx_input_shape(
            args.model_filepath, args.image_shape
        )

    # load model
    if args.engine == DEEPSPARSE_ENGINE:
        print(f"Compiling deepsparse model for {args.model_filepath}")
        model = compile_model(
            args.model_filepath, args.batch_size, args.num_cores, args.num_sockets
        )
        if args.quantized_inputs and not model.cpu_vnni:
            print(
                "WARNING: VNNI instructions not detected, "
                "quantization speedup not well supported"
            )
    elif args.engine == ORT_ENGINE:
        print(f"loading onnxruntime model for {args.model_filepath}")
        sess_options = onnxruntime.SessionOptions()
        if args.num_cores is not None:
            sess_options.intra_op_num_threads = args.num_cores
        sess_options.log_severity_level = 3
        sess_options.graph_optimization_level = (
            onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        )
        onnx_model = onnx.load(args.model_filepath)
        override_model_batch_size(onnx_model, args.batch_size)
        model = onnxruntime.InferenceSession(
            onnx_model.SerializeToString(), sess_options=sess_options
        )
    elif args.engine == TORCH_ENGINE:
        print(f"loading torch model for {args.model_filepath}")
        model = torch.load(args.model_filepath)
        if isinstance(model, dict):
            model = model["model"]
        model.to(args.device)
        model.eval()
        if args.fp16:
            print("Using half precision")
            model.half()
        else:
            print("Using full precision")

    return model
def run(worker_id, args, barrier, cpu_affinity_set, results):
    # Set the CPU affinity of this process
    numas = cpus_to_numas(cpu_affinity_set)
    numa.memory.set_interleave_nodes(*numas)
    numa.schedule.run_on_nodes(*numas)
    # Run on given CPUs. The first argument is the process id,
    # and 0 means this present process
    numa.schedule.run_on_cpus(0, *cpu_affinity_set)

    # Suppress output from all but one worker process if quiet is set,
    # so that the user doesn't see the splash message for each process.
    std_out_and_err = None
    null_file_descriptors = None
    suppress_output = args.quiet and worker_id != 0
    if suppress_output:
        null_file_descriptors = [os.open(os.devnull, os.O_WRONLY) for x in range(2)]
        std_out_and_err = os.dup(1), os.dup(2)
        os.dup2(null_file_descriptors[0], 1)
        os.dup2(null_file_descriptors[1], 2)

    input_shapes = parse_input_shapes(args.input_shapes)

    # Compile the model
    model = compile_model(
        model=args.model_path,
        batch_size=args.batch_size,
        num_cores=len(cpu_affinity_set),
        scheduler="single_stream",
        input_shapes=input_shapes,
    )

    # Cleanly separate compilation and benchmarking
    barrier.wait()

    # Generate random inputs to feed the model
    # TODO(mgoin): should be able to query Engine class instead of loading ONNX
    if input_shapes:
        with override_onnx_input_shapes(args.model_path, input_shapes) as model_path:
            input_list = generate_random_inputs(model_path, args.batch_size)
    else:
        input_list = generate_random_inputs(args.model_path, args.batch_size)

    # Warm up the engine
    singlestream_benchmark(model, input_list, args.warmup_time)

    # Run the single-stream benchmark scenario and collect batch times
    batch_times = singlestream_benchmark(model, input_list, args.time)

    # Cooldown the engine
    singlestream_benchmark(model, input_list, args.warmup_time)

    if len(batch_times) == 0:
        raise ValueError(
            "Generated no batch timings, try extending benchmark time with '--time'"
        )

    results[worker_id] = batch_times

    # Restore stderr and stdout if we suppressed them.
    if suppress_output:
        os.dup2(std_out_and_err[0], 1)
        os.dup2(std_out_and_err[1], 2)
        os.close(null_file_descriptors[0])
        os.close(null_file_descriptors[1])
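# Hypothetical driver sketch showing how the run() worker above could be launched
# across several processes (the real launcher in the example may differ): each
# worker receives its CPU-affinity set, a shared barrier, and a shared results dict.
import multiprocessing


def launch_workers(args, cpu_affinity_sets):
    manager = multiprocessing.Manager()
    results = manager.dict()
    barrier = multiprocessing.Barrier(len(cpu_affinity_sets))
    workers = [
        multiprocessing.Process(
            target=run, args=(worker_id, args, barrier, cpu_affinity_set, results)
        )
        for worker_id, cpu_affinity_set in enumerate(cpu_affinity_sets)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    # Collect per-worker batch timings keyed by worker_id
    return dict(results)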