Example #1
def create_model_inference_app(
    model_path: str, batch_size: int, num_cores: int, address: str, port: str
) -> flask.Flask:
    print(f"Compiling model at {model_path}")
    engine = compile_model(model_path, batch_size, num_cores)
    print(engine)

    app = flask.Flask(__name__)
    CORS(app)

    @app.route("/predict", methods=["POST"])
    def predict():
        data = flask.request.get_data()

        inputs = bytes_to_tensors(data)
        print(f"Received {len(inputs)} inputs from client")

        print("Executing model")
        outputs, elapsed_time = engine.timed_run(inputs)

        print(f"Inference time took {elapsed_time * 1000.0:.4f} milliseconds")
        print(f"Produced {len(outputs)} output tensors")
        return tensors_to_bytes(outputs)

    @app.route("/info", methods=["GET"])
    def info():
        return flask.jsonify({"model_path": model_path, "engine": repr(engine)})

    print("Starting Flask app")
    app.run(host=address, port=port, debug=False, threaded=True)
Example #2
def main():
    args = parse_args()
    model = fetch_model(args.model_name)
    batch_size = args.batch_size
    num_cores = args.num_cores

    # Gather batch of data
    batch = model.sample_batch(batch_size=batch_size)
    batched_inputs = batch["inputs"]
    batched_outputs = batch["outputs"]

    # Compile model for inference
    print("Compiling {} model with DeepSparse Engine".format(model.architecture_id))
    engine = compile_model(model, batch_size, num_cores)
    print(engine)

    # INFERENCE
    # Record output from inference through the DeepSparse Engine
    print("Executing...")
    predicted_outputs = engine(batched_inputs)

    # Compare against reference model output
    verify_outputs(predicted_outputs, batched_outputs)

    # BENCHMARK
    # Measure inference performance through the DeepSparse Engine
    print("Benchmarking...")
    results = engine.benchmark(batched_inputs)
    print(results)
Example #3
def main():
    args = parse_args()
    onnx_filepath = args.onnx_filepath
    batch_size = args.batch_size

    inputs = generate_random_inputs(onnx_filepath, batch_size)
    input_names = get_input_names(onnx_filepath)
    output_names = get_output_names(onnx_filepath)
    inputs_dict = {name: value for name, value in zip(input_names, inputs)}

    # ONNXRuntime inference
    print("Executing model with ONNXRuntime...")
    sess_options = onnxruntime.SessionOptions()
    with override_onnx_batch_size(onnx_filepath, batch_size) as override_onnx_filepath:
        ort_network = onnxruntime.InferenceSession(override_onnx_filepath, sess_options)

        ort_outputs = ort_network.run(output_names, inputs_dict)

    # DeepSparse Engine inference
    print("Executing model with DeepSparse Engine...")
    dse_network = compile_model(onnx_filepath, batch_size=batch_size)
    dse_outputs = dse_network(inputs)

    verify_outputs(dse_outputs, ort_outputs)

    print("DeepSparse Engine output matches ONNXRuntime output")
Example #4
    def test_engine(self, model: Model, batch_size: int):
        """
        Test the Engine inference interfaces (callable, run, mapped_run, timed_run)
        """
        m = model()
        batch = m.sample_batch(batch_size=batch_size)
        inputs = batch["inputs"]
        outputs = batch["outputs"]

        print("compile model")
        engine = compile_model(m, batch_size)

        print("engine callable")
        pred_outputs = engine(inputs)
        verify_outputs(pred_outputs, outputs)

        print("engine run")
        pred_outputs = engine.run(inputs)
        verify_outputs(pred_outputs, outputs)

        print("engine mapped_run")
        pred_outputs = engine.mapped_run(inputs)
        assert len(pred_outputs) == len(outputs)

        print("engine timed_run")
        pred_outputs, elapsed = engine.timed_run(inputs)
        verify_outputs(pred_outputs, outputs)
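A condensed sketch of the return types these interfaces exercise, assuming mapped_run keys each output tensor by its ONNX output name (an assumption, not verified above); generate_random_inputs stands in for the sample batch used in the test.

from deepsparse import compile_model
from deepsparse.utils import generate_random_inputs


def exercise_engine_interfaces(onnx_filepath: str, batch_size: int = 1):
    engine = compile_model(onnx_filepath, batch_size)
    inputs = generate_random_inputs(onnx_filepath, batch_size)

    outputs_list = engine.run(inputs)            # list of numpy arrays, same as engine(inputs)
    outputs_by_name = engine.mapped_run(inputs)  # assumed: dict keyed by output name
    outputs_list, seconds = engine.timed_run(inputs)  # outputs plus elapsed wall-clock seconds
    return outputs_list, outputs_by_name, seconds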
Example #5
def engine_flask_server(
    model_path: str,
    batch_size: int = 1,
    num_cores: int = None,
    scheduler: Scheduler = Scheduler.multi_stream,
    address: str = "0.0.0.0",
    port: str = "5543",
) -> flask.Flask:
    """

    :param model_path: Either a path to the model's onnx file, a SparseZoo model stub
        prefixed by 'zoo:', a SparseZoo Model object, or a SparseZoo ONNX File
        object that defines the neural network
    :param batch_size: The batch size of the inputs to be used with the model
    :param num_cores: The number of physical cores to run the model on.
        Pass None or 0 to run on the max number of cores
        in one socket for the current machine, default None
    :param scheduler: The kind of scheduler to execute with. Defaults to multi_stream
    :param address: IP address to run on. Default is 0.0.0.0
    :param port: port to run on. Default is 5543
    :return: launches a flask server on the given address and port that can run
        the given model on the DeepSparse engine via HTTP requests
    """
    _LOGGER.info(f"Compiling model at {model_path}")
    engine = compile_model(model_path,
                           batch_size,
                           num_cores,
                           scheduler=scheduler)
    _LOGGER.info(engine)

    app = flask.Flask(__name__)
    CORS(app)

    @app.route("/run", methods=["POST"])
    def run():
        data = flask.request.get_data()

        inputs = bytes_to_arrays(data)
        _LOGGER.info(f"Received {len(inputs)} inputs from client")

        _LOGGER.info("Executing model")
        outputs, elapsed_time = engine.timed_run(inputs)

        _LOGGER.info(
            f"Inference took {elapsed_time * 1000.0:.4f} milliseconds")
        _LOGGER.info(f"Produced {len(outputs)} output tensors")
        return arrays_to_bytes(outputs)

    @app.route("/info", methods=["GET"])
    def info():
        return flask.jsonify({
            "model_path": model_path,
            "engine": repr(engine)
        })

    _LOGGER.info("Starting Flask app")
    app.run(host=address, port=port, debug=False, threaded=True)
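A client-side sketch for the /run endpoint above, assuming the same arrays_to_bytes / bytes_to_arrays helpers are importable on the client and that the server is already running on the default port.

import numpy
import requests

from deepsparse.utils import arrays_to_bytes, bytes_to_arrays


def run_remote_inference(inputs, address: str = "localhost", port: str = "5543"):
    # Serialize the list of numpy arrays and POST it to the Flask server
    response = requests.post(
        f"http://{address}:{port}/run", data=arrays_to_bytes(inputs)
    )
    response.raise_for_status()
    # The server replies with the serialized output tensors
    return bytes_to_arrays(response.content)


if __name__ == "__main__":
    # Illustrative usage with a single random input tensor (shape is a placeholder)
    print(run_remote_inference([numpy.random.rand(1, 3, 224, 224).astype(numpy.float32)]))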
Example #6
def create_and_run_model_server(
    args, model_path: str, batch_size: int, num_cores: int, address: str, port: str
) -> flask.Flask:
    print(f"Compiling model at {model_path}")
    engine = compile_model(model_path, batch_size, num_cores)
    print(engine)

    postprocessor = YoloPostprocessor()

    app = flask.Flask(__name__)
    CORS(app)

    @app.route("/predict", methods=["POST"])
    def predict():
        # load raw images
        raw_data = flask.request.get_data()
        inputs = bytes_to_arrays(raw_data)
        print(f"Received {len(inputs)} images from client")

        # pre-processing
        preprocess_start_time = time.time()
        if not args.quantized_inputs:
            inputs = [inputs[0].astype(numpy.float32) / 255.0]
        preprocess_time = time.time() - preprocess_start_time
        print(f"Pre-processing time: {preprocess_time * 1000.0:.4f}ms")

        # inference
        print("Executing model")
        outputs, elapsed_time = engine.timed_run(inputs)
        print(f"Inference time: {elapsed_time * 1000.0:.4f}ms")

        # post-processing
        postprocess_start_time = time.time()
        outputs = postprocessor.pre_nms_postprocess(outputs)
        postprocess_time = time.time() - postprocess_start_time
        print(f"Post-processing, pre-nms time: {postprocess_time * 1000.0:.4f}ms")

        # NMS
        nms_start_time = time.time()
        outputs = postprocess_nms(outputs)
        nms_time = time.time() - nms_start_time
        print(f"nms time: {nms_time * 1000.0:.4f}ms")

        return arrays_to_bytes(outputs)

    @app.route("/info", methods=["GET"])
    def info():
        return flask.jsonify({"model_path": model_path, "engine": repr(engine)})

    print("Starting Flask app")
    app.run(host=address, port=port, debug=False, threaded=True)
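postprocess_nms is imported elsewhere; as a generic illustration of the step it performs, a textbook greedy non-maximum suppression over (x1, y1, x2, y2) corner boxes follows. This is a sketch, not the repository's implementation.

import numpy


def nms_sketch(boxes: numpy.ndarray, scores: numpy.ndarray, iou_threshold: float = 0.45):
    # boxes: array of shape (N, 4) holding (x1, y1, x2, y2) corners
    # scores: array of shape (N,) holding confidence scores
    # returns the indices of boxes kept after suppression
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]  # highest score first

    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # Intersection of the kept box with the remaining candidates
        xx1 = numpy.maximum(x1[i], x1[order[1:]])
        yy1 = numpy.maximum(y1[i], y1[order[1:]])
        xx2 = numpy.minimum(x2[i], x2[order[1:]])
        yy2 = numpy.minimum(y2[i], y2[order[1:]])
        inter = numpy.clip(xx2 - xx1, 0, None) * numpy.clip(yy2 - yy1, 0, None)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        # Drop candidates that overlap the kept box too much
        order = order[1:][iou <= iou_threshold]
    return keep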
Example #7
    def __init__(
        self,
        model: Union[str, ModelProto],
        batch_size: int,
        num_cores: int = None,
        loss: Union[
            Callable[[Dict[str, numpy.ndarray], Dict[str, numpy.ndarray]], Any], None
        ] = None,
    ):
        super().__init__(model, batch_size, num_cores, loss)
        self._engine = compile_model(self._model,
                                     batch_size=batch_size,
                                     num_cores=num_cores)
        _LOGGER.debug("created model in neural magic {}".format(self._engine))
Example #8
def _load_model(args) -> Tuple[Any, List[str]]:
    if args.engine == ORT_ENGINE and ort_error is not None:
        raise ort_error

    # validation
    if (args.num_cores is not None and args.engine == ORT_ENGINE
            and onnxruntime.__version__ < "1.7"):
        raise ValueError(
            "overriding default num_cores not supported for onnxruntime < 1.7.0. "
            "If using an older build with OpenMP, try setting the OMP_NUM_THREADS "
            "environment variable")

    # load model from sparsezoo if necessary
    if args.model_filepath.startswith("zoo:"):
        zoo_model = Zoo.load_model_from_stub(args.model_filepath)
        downloaded_path = zoo_model.onnx_file.downloaded_path()
        print(
            f"downloaded sparsezoo model {args.model_filepath} to {downloaded_path}"
        )
        args.model_filepath = downloaded_path

    # scale static ONNX graph to desired image shape
    input_names = []
    if args.engine in [DEEPSPARSE_ENGINE, ORT_ENGINE]:
        args.model_filepath, input_names, _ = overwrite_transformer_onnx_model_inputs(
            args.model_filepath,
            batch_size=args.batch_size,
            max_length=args.max_sequence_length,
        )

    # load model
    if args.engine == DEEPSPARSE_ENGINE:
        print(f"Compiling deepsparse model for {args.model_filepath}")
        model = compile_model(args.model_filepath, args.batch_size,
                              args.num_cores)
        print(f"Engine info: {model}")
    elif args.engine == ORT_ENGINE:
        print(f"loading onnxruntime model for {args.model_filepath}")

        sess_options = onnxruntime.SessionOptions()
        if args.num_cores is not None:
            sess_options.intra_op_num_threads = args.num_cores
        sess_options.log_severity_level = 3
        sess_options.graph_optimization_level = (
            onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL)
        model = onnxruntime.InferenceSession(args.model_filepath,
                                             sess_options=sess_options)

    return model, input_names
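The input_names returned here matter because the DeepSparse engine consumes an ordered list of arrays while onnxruntime consumes a name-keyed feed dict. A sketch of building both from tokenized text, assuming a HuggingFace tokenizer and that the graph's input names match the tokenizer's output keys (input_ids, attention_mask, ...); the model name is illustrative.

import numpy
from transformers import AutoTokenizer  # assumed dependency for this sketch


def build_transformer_inputs(text: str, input_names, max_length: int = 128):
    # Tokenize to fixed-length numpy arrays matching the static ONNX graph
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative model name
    encoded = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="np",
    )
    # onnxruntime takes a dict keyed by input name ...
    feed_dict = {name: encoded[name].astype(numpy.int64) for name in input_names}
    # ... while the DeepSparse engine takes the same arrays as an ordered list
    ordered_inputs = [feed_dict[name] for name in input_names]
    return feed_dict, ordered_inputs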
Example #9
    def test_benchmark(self, model: Model, batch_size: int):
        """
        Test the Engine.benchmark() interface
        """

        m = model()
        batch = m.sample_batch(batch_size=batch_size)
        inputs = batch["inputs"]
        outputs = batch["outputs"]

        engine = compile_model(m, batch_size)
        results = engine.benchmark(inputs, include_outputs=True)

        for output in results.outputs:
            verify_outputs(output, outputs)
Example #10
def main():
    args = parse_args()
    onnx_filepath = args.onnx_filepath
    batch_size = args.batch_size
    num_cores = args.num_cores
    num_iterations = args.num_iterations
    num_warmup_iterations = args.num_warmup_iterations

    inputs = generate_random_inputs(onnx_filepath, batch_size)
    input_names = get_input_names(onnx_filepath)
    output_names = get_output_names(onnx_filepath)
    inputs_dict = {name: value for name, value in zip(input_names, inputs)}

    # Benchmark ONNXRuntime
    print("Benchmarking model with ONNXRuntime...")
    sess_options = onnxruntime.SessionOptions()
    sess_options.intra_op_num_threads = num_cores
    with override_onnx_batch_size(onnx_filepath, batch_size) as override_onnx_filepath:
        ort_network = onnxruntime.InferenceSession(override_onnx_filepath, sess_options)

        ort_results = BenchmarkResults()
        for i in range(num_warmup_iterations):
            ort_network.run(output_names, inputs_dict)
        for i in range(num_iterations):
            start = time.time()
            output = ort_network.run(output_names, inputs_dict)
            end = time.time()
            ort_results.append_batch(
                time_start=start, time_end=end, batch_size=batch_size, outputs=output
            )

    # Benchmark DeepSparse Engine
    print("Benchmarking model with DeepSparse Engine...")
    dse_network = compile_model(onnx_filepath, batch_size, num_cores)
    dse_results = dse_network.benchmark(
        inputs, num_iterations, num_warmup_iterations, include_outputs=True
    )

    for dse_output, ort_output in zip(dse_results.outputs, ort_results.outputs):
        verify_outputs(dse_output, ort_output)

    print("ONNXRuntime", ort_results)
    print()
    print("DeepSparse Engine", dse_results)
Example #11
def main():
    """
    process arguments and run server
    """
    _config = parse_args()
    # Get model
    print(f"Compiling model at {_config.onnx_filepath}")
    engine = compile_model(_config.onnx_filepath, _config.batch_size,
                           _config.num_cores)

    print(engine)

    # Serve model
    run_server(
        predictor=engine,
        host=_config.address,
        port=_config.port,
        info=_config.onnx_filepath,
    )
Example #12
def _load_model(
    model_filepath: str,
    batch_size: int,
    num_cores: Optional[int],
    engine: str,
    image_shape: Tuple[int, int],
):
    # load and return the appropriate classification model for the given arguments

    if (num_cores is not None and engine == ORT_ENGINE
            and onnxruntime.__version__ < "1.7"):
        raise ValueError(
            "overriding default num_cores not supported for onnxruntime < 1.7.0. "
            "If using an older build with OpenMP, try setting the OMP_NUM_THREADS "
            "environment variable")

    # scale static ONNX graph to desired image shape
    if engine in [DEEPSPARSE_ENGINE, ORT_ENGINE]:
        model_filepath, _, image_shape = fix_onnx_input_shape(
            model_filepath, image_shape)

    # load model
    if engine == DEEPSPARSE_ENGINE:
        print(f"Compiling deepsparse model for {model_filepath}")
        model = compile_model(model_filepath, batch_size, num_cores)
        print(f"Engine info: {model}")

    elif engine == ORT_ENGINE:
        print(f"loading onnxruntime model for {model_filepath}")

        sess_options = onnxruntime.SessionOptions()
        if num_cores is not None:
            sess_options.intra_op_num_threads = num_cores
        sess_options.log_severity_level = 3
        sess_options.graph_optimization_level = (
            onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL)

        onnx_model = onnx.load(model_filepath)
        override_model_batch_size(onnx_model, batch_size)
        model = onnxruntime.InferenceSession(onnx_model.SerializeToString(),
                                             sess_options=sess_options)

    return model, image_shape
Example #13
def main():
    args = parse_args()
    model = fetch_model(args.model_name)
    batch_size = args.batch_size
    num_cores = args.num_cores

    # Gather batch of data
    batch = model.sample_batch(batch_size=batch_size)
    batched_inputs = batch["inputs"]
    batched_outputs = batch["outputs"]

    # Compile model for inference
    print("Compiling {} model with DeepSparse Engine".format(
        model.architecture_id))
    engine = compile_model(model, batch_size, num_cores)
    print(engine)

    # INFERENCE
    # Record output from inference through the DeepSparse Engine
    print("Executing...")
    predicted_outputs = engine(batched_inputs)

    # Compare against reference model output
    verify_outputs(predicted_outputs, batched_outputs)

    if "labels" in batch:
        batched_labels = batch["labels"]
        # Measure accuracy against ground truth labels
        accuracy = calculate_top1_accuracy(predicted_outputs[-1],
                                           batched_labels[0])
        print("Top-1 Accuracy for batch size {}: {:.2f}%".format(
            batch_size, accuracy))

    # BENCHMARK
    # Measure inference performance through the DeepSparse Engine
    print("Benchmarking...")
    results = engine.benchmark(batched_inputs)
    print(results)
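calculate_top1_accuracy is imported rather than shown; a minimal sketch of the usual computation, assuming the last output holds per-class scores of shape (batch, num_classes) and the labels are either class indices or one-hot vectors.

import numpy


def top1_accuracy_sketch(class_scores: numpy.ndarray, labels: numpy.ndarray) -> float:
    # Predicted class is the argmax over the class dimension
    predictions = numpy.argmax(class_scores, axis=-1)
    # Accept either one-hot labels or plain class indices
    if labels.ndim == predictions.ndim + 1:
        labels = numpy.argmax(labels, axis=-1)
    return float((predictions == labels).mean() * 100.0)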
Example #14
def main():

    args = parse_args()

    if args.quiet:
        set_logging_level(logging.WARN)

    decide_thread_pinning(args.thread_pinning)

    scenario = parse_scenario(args.scenario.lower())
    scheduler = parse_scheduler(scenario)
    input_shapes = parse_input_shapes(args.input_shapes)

    orig_model_path = args.model_path
    args.model_path = model_to_path(args.model_path)

    # Compile the ONNX into a runnable model
    if args.engine == DEEPSPARSE_ENGINE:
        model = compile_model(
            model=args.model_path,
            batch_size=args.batch_size,
            num_cores=args.num_cores,
            scheduler=scheduler,
            input_shapes=input_shapes,
        )
    elif args.engine == ORT_ENGINE:
        model = ORTEngine(
            model=args.model_path,
            batch_size=args.batch_size,
            num_cores=args.num_cores,
            input_shapes=input_shapes,
        )
    _LOGGER.info(model)

    # Generate random inputs to feed the model
    # TODO(mgoin): should be able to query Engine class instead of loading ONNX
    if input_shapes:
        with override_onnx_input_shapes(args.model_path,
                                        input_shapes) as model_path:
            input_list = generate_random_inputs(model_path, args.batch_size)
    else:
        input_list = generate_random_inputs(args.model_path, args.batch_size)

    if args.num_streams:
        _LOGGER.info("num_streams set to {}".format(args.num_streams))
    elif not args.num_streams and scenario not in "singlestream":
        # If num_streams isn't defined, find a default
        args.num_streams = max(1, int(model.num_cores / 2))
        _LOGGER.info("num_streams default value chosen of {}. "
                     "This requires tuning and may be sub-optimal".format(
                         args.num_streams))

    # Benchmark
    _LOGGER.info(
        "Starting '{}' performance measurements for {} seconds".format(
            args.scenario, args.time))
    benchmark_result = model_stream_benchmark(
        model,
        input_list,
        scenario=scenario,
        seconds_to_run=args.time,
        seconds_to_warmup=args.warmup_time,
        num_streams=args.num_streams,
    )

    # Results summary
    print("Original Model Path: {}".format(orig_model_path))
    print("Batch Size: {}".format(args.batch_size))
    print("Scenario: {}".format(scenario))
    print("Throughput (items/sec): {:.4f}".format(
        benchmark_result["items_per_sec"]))
    print("Latency Mean (ms/batch): {:.4f}".format(benchmark_result["mean"]))
    print("Latency Median (ms/batch): {:.4f}".format(
        benchmark_result["median"]))
    print("Latency Std (ms/batch): {:.4f}".format(benchmark_result["std"]))
    print("Iterations: {}".format(int(benchmark_result["iterations"])))

    if args.export_path:
        # Export results
        print("Saving benchmark results to JSON file at {}".format(
            args.export_path))
        export_dict = {
            "engine": str(model),
            "orig_model_path": orig_model_path,
            "model_path": args.model_path,
            "batch_size": args.batch_size,
            "input_shapes": args.input_shapes,
            "num_cores": args.num_cores,
            "scenario": args.scenario,
            "scheduler": str(model.scheduler),
            "seconds_to_run": args.time,
            "num_streams": args.num_streams,
            "benchmark_result": benchmark_result,
        }
        with open(args.export_path, "w") as out:
            json.dump(export_dict, out, indent=2)
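Since the export above is plain JSON, results can be reloaded and compared across runs without deepsparse installed; a small sketch using the keys written by export_dict above.

import json


def print_benchmark_export(export_path: str) -> None:
    # Re-read the file written by the benchmark run above
    with open(export_path) as json_file:
        export = json.load(json_file)

    result = export["benchmark_result"]
    print("{} (batch {}, {}): {:.2f} items/sec, median {:.2f} ms/batch".format(
        export["orig_model_path"],
        export["batch_size"],
        export["scenario"],
        result["items_per_sec"],
        result["median"],
    ))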
Example #15
def _load_model(args) -> Any:
    # validation
    if args.device not in [None, "cpu"] and args.engine != TORCH_ENGINE:
        raise ValueError(
            f"device {args.device} is not supported for {args.engine}")
    if args.fp16 and args.engine != TORCH_ENGINE:
        raise ValueError(f"half precision is not supported for {args.engine}")
    if args.quantized_inputs and args.engine == TORCH_ENGINE:
        raise ValueError(f"quantized inputs not supported for {args.engine}")
    if args.num_cores is not None and args.engine == TORCH_ENGINE:
        raise ValueError(
            f"overriding default num_cores not supported for {args.engine}")
    if (args.num_cores is not None and args.engine == ORT_ENGINE
            and onnxruntime.__version__ < "1.7"):
        raise ValueError(
            "overriding default num_cores not supported for onnxruntime < 1.7.0. "
            "If using an older build with OpenMP, try setting the OMP_NUM_THREADS "
            "environment variable")
    if args.num_sockets is not None and args.engine != DEEPSPARSE_ENGINE:
        raise ValueError(
            f"Overriding num_sockets is not supported for {args.engine}")

    # scale static ONNX graph to desired image shape
    if args.engine in [DEEPSPARSE_ENGINE, ORT_ENGINE]:
        args.model_filepath, _ = modify_yolo_onnx_input_shape(
            args.model_filepath, args.image_shape)

    # load model
    if args.engine == DEEPSPARSE_ENGINE:
        print(f"Compiling deepsparse model for {args.model_filepath}")
        model = compile_model(args.model_filepath, args.batch_size,
                              args.num_cores, args.num_sockets)
        if args.quantized_inputs and not model.cpu_vnni:
            print("WARNING: VNNI instructions not detected, "
                  "quantization speedup not well supported")
    elif args.engine == ORT_ENGINE:
        print(f"loading onnxruntime model for {args.model_filepath}")

        sess_options = onnxruntime.SessionOptions()
        if args.num_cores is not None:
            sess_options.intra_op_num_threads = args.num_cores
        sess_options.log_severity_level = 3
        sess_options.graph_optimization_level = (
            onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL)

        onnx_model = onnx.load(args.model_filepath)
        override_model_batch_size(onnx_model, args.batch_size)
        model = onnxruntime.InferenceSession(onnx_model.SerializeToString(),
                                             sess_options=sess_options)
    elif args.engine == TORCH_ENGINE:
        print(f"loading torch model for {args.model_filepath}")
        model = torch.load(args.model_filepath)
        if isinstance(model, dict):
            model = model["model"]
        model.to(args.device)
        model.eval()
        if args.fp16:
            print("Using half precision")
            model.half()
        else:
            print("Using full precision")
    return model
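The object returned differs per engine: the compiled DeepSparse engine is callable on a list of arrays, the onnxruntime session uses run() with a name-keyed feed, and the torch model takes tensors. A sketch of the dispatch a caller might perform; the engine strings and helper name are illustrative, and device placement and half precision are ignored for brevity.

import torch


def run_inference(model, engine: str, inputs, input_names=None):
    # inputs: list of numpy arrays in the model's input order
    if engine == "deepsparse":
        return model(inputs)  # Engine objects are callable on a list of arrays
    if engine == "onnxruntime":
        feed = dict(zip(input_names, inputs))
        return model.run(None, feed)  # None requests all outputs
    if engine == "torch":
        with torch.no_grad():
            tensors = [torch.from_numpy(array) for array in inputs]
            return model(*tensors)
    raise ValueError(f"unsupported engine: {engine}")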
Example #16
def run(worker_id, args, barrier, cpu_affinity_set, results):
    # Set the CPU affinity of this process
    numas = cpus_to_numas(cpu_affinity_set)
    numa.memory.set_interleave_nodes(*numas)
    numa.schedule.run_on_nodes(*numas)

    # Run on the given CPUs. The first argument is the process id;
    # 0 means the current process
    numa.schedule.run_on_cpus(0, *cpu_affinity_set)

    # Suppress output from all but one worker process if quiet is set,
    # so that the user doesn't see the splash message for each process.
    std_out_and_err = None
    null_file_descriptors = None
    suppress_output = args.quiet and worker_id != 0
    if suppress_output:
        null_file_descriptors = [
            os.open(os.devnull, os.O_WRONLY) for x in range(2)
        ]
        std_out_and_err = os.dup(1), os.dup(2)
        os.dup2(null_file_descriptors[0], 1)
        os.dup2(null_file_descriptors[1], 2)

    input_shapes = parse_input_shapes(args.input_shapes)

    # Compile the model
    model = compile_model(
        model=args.model_path,
        batch_size=args.batch_size,
        num_cores=len(cpu_affinity_set),
        scheduler="single_stream",
        input_shapes=input_shapes,
    )

    # Cleanly separate compilation and benchmarking
    barrier.wait()

    # Generate random inputs to feed the model
    # TODO(mgoin): should be able to query Engine class instead of loading ONNX
    if input_shapes:
        with override_onnx_input_shapes(args.model_path,
                                        input_shapes) as model_path:
            input_list = generate_random_inputs(model_path, args.batch_size)
    else:
        input_list = generate_random_inputs(args.model_path, args.batch_size)

    # Warm up the engine
    singlestream_benchmark(model, input_list, args.warmup_time)

    # Run the single-stream benchmark scenario and collect batch times
    batch_times = singlestream_benchmark(model, input_list, args.time)

    # Cooldown the engine
    singlestream_benchmark(model, input_list, args.warmup_time)

    if len(batch_times) == 0:
        raise ValueError(
            "Generated no batch timings, try extending benchmark time with '--time'"
        )

    results[worker_id] = batch_times

    # Restore stderr and stdout if we suppressed them.
    if suppress_output:
        os.dup2(std_out_and_err[0], 1)
        os.dup2(std_out_and_err[1], 2)

        os.close(null_file_descriptors[0])
        os.close(null_file_descriptors[1])
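run above expects a parent process to supply a barrier, per-worker CPU sets, and a shared results mapping; a sketch of that orchestration around the run function above, with the CPU split and worker count chosen by the caller (this mirrors the call signature, not the tool's actual launch logic).

import multiprocessing


def launch_workers(args, cpu_sets):
    # cpu_sets: one collection of CPU ids per worker, e.g. [{0, 1}, {2, 3}]
    manager = multiprocessing.Manager()
    results = manager.dict()
    barrier = multiprocessing.Barrier(len(cpu_sets))

    workers = [
        multiprocessing.Process(
            target=run, args=(worker_id, args, barrier, cpu_set, results)
        )
        for worker_id, cpu_set in enumerate(cpu_sets)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # Flatten per-worker batch times into a single list for aggregation
    return [t for worker_id in sorted(results.keys()) for t in results[worker_id]]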