Example #1
def prepare_environment(cache_dir, output_dir, use_gpu, provider=None):
    if cache_dir and not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    import onnxruntime

    if use_gpu:
        if provider == "dml":
            assert (
                "DmlExecutionProvider"
                in onnxruntime.get_available_providers()
            ), "Please install onnxruntime-directml package to test GPU inference."

        else:
            assert (
                "CUDAExecutionProvider"
                in onnxruntime.get_available_providers()
            ), "Please install onnxruntime-gpu package to test GPU inference."

    import transformers

    logger.info(f"PyTorch Version:{torch.__version__}")
    logger.info(f"Transformers Version:{transformers.__version__}")
    logger.info(f"Onnxruntime Version:{onnxruntime.__version__}")

    # Support the three latest major versions of PyTorch and OnnxRuntime, and up to 6 months of transformers releases.
    from packaging import version

    assert version.parse(torch.__version__) >= version.parse("1.5.0")
    assert version.parse(transformers.__version__) >= version.parse("3.0.0")
    assert version.parse(onnxruntime.__version__) >= version.parse("1.4.0")
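The provider check in prepare_environment() above is the recurring pattern in these examples. A minimal standalone sketch of that check, assuming only that onnxruntime is installed:

import onnxruntime

# List the execution providers compiled into this onnxruntime build.
available = onnxruntime.get_available_providers()
print(available)  # e.g. ['CUDAExecutionProvider', 'CPUExecutionProvider'] on a GPU build

# Guard a GPU code path the same way prepare_environment() does.
if "CUDAExecutionProvider" not in available:
    print("Install the onnxruntime-gpu package to test GPU inference.")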
Example #2
def convert_onnx_models_to_ort():
    args = parse_args()

    model_path_or_dir = args.model_path_or_dir.resolve()
    custom_op_library = args.custom_op_library.resolve() if args.custom_op_library else None

    if not model_path_or_dir.is_dir() and not model_path_or_dir.is_file():
        raise FileNotFoundError("Model path '{}' is not a file or directory.".format(model_path_or_dir))

    if custom_op_library and not custom_op_library.is_file():
        raise FileNotFoundError("Unable to find custom operator library '{}'".format(custom_op_library))

    if args.use_nnapi and 'NnapiExecutionProvider' not in ort.get_available_providers():
        raise ValueError('The NNAPI Execution Provider was not included in this build of ONNX Runtime.')

    if args.use_coreml and 'CoreMLExecutionProvider' not in ort.get_available_providers():
        raise ValueError('The CoreML Execution Provider was not included in this build of ONNX Runtime.')

    session_options_config_entries = {}

    if args.nnapi_partitioning_stop_ops is not None:
        session_options_config_entries["ep.nnapi.partitioning_stop_ops"] = args.nnapi_partitioning_stop_ops

    if args.target_platform == 'arm':
        session_options_config_entries["session.qdqisint8allowed"] = "1"
    else:
        session_options_config_entries["session.qdqisint8allowed"] = "0"

    for optimization_level in args.optimization_level:
        print(f"Converting models and creating configuration file for optimization level '{optimization_level}'")
        _convert(model_path_or_dir, optimization_level, args.use_nnapi, args.use_coreml, custom_op_library,
                 args.save_optimized_onnx_model, args.allow_conversion_failures, args.target_platform,
                 session_options_config_entries)

        _create_config_file_from_ort_models(model_path_or_dir, optimization_level, args.enable_type_reduction)
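For context, a minimal sketch of how string key/value session config entries like the ones collected above can be applied to a SessionOptions object; that _convert() does exactly this internally is an assumption:

import onnxruntime as ort

# Sample entry taken from the function above.
session_options_config_entries = {"session.qdqisint8allowed": "1"}

so = ort.SessionOptions()
for key, value in session_options_config_entries.items():
    # Session config entries are plain string key/value pairs.
    so.add_session_config_entry(key, value)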
Example #3
def main():
    args = parse_arguments()

    if args.test_times == 0:
        args.test_times = max(1, int(1000 / args.samples))

    if args.use_gpu and ('CUDAExecutionProvider'
                         not in onnxruntime.get_available_providers()):
        print(
            "Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )
        return
    elif (not args.use_gpu) and ('CUDAExecutionProvider'
                                 in onnxruntime.get_available_providers()):
        print(
            "Warning: Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance."
        )

    average_latency = {}
    contiguous_latency = run_performance(average_latency, args.model,
                                         args.batch_size, args.sequence_length,
                                         args.use_gpu, args.samples,
                                         args.test_times, args.seed,
                                         args.verbose, args.all)

    if not average_latency:
        return

    summary_file = os.path.join(
        Path(args.model).parent, "perf_results_{}_B{}_S{}_{}.txt".format(
            'GPU' if args.use_gpu else 'CPU', args.batch_size,
            args.sequence_length,
            datetime.now().strftime("%Y%m%d-%H%M%S")))
    with open(summary_file, 'w+', newline='') as tsv_file:
        tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')
        headers = None
        for key, latency in average_latency.items():
            params = key.split(',')
            if headers is None:
                headers = ["Latency(ms)", "Throughput(QPS)"]
                headers.extend([x.split('=')[0] for x in params])
                tsv_writer.writerow(headers)

            # include the extra latency of array conversion if required.
            if args.inclusive and 'contiguous=True' in params:
                latency += contiguous_latency

            throughput = args.batch_size * (1000 / latency)
            values = [format(latency, '.2f'), format(throughput, '.2f')]

            values.extend([x.split('=')[1] for x in params])
            tsv_writer.writerow(values)

    print("Test summary is saved to", summary_file)
Example #4
def create_session(model_path,
                   use_gpu,
                   intra_op_num_threads,
                   graph_optimization_level=None):
    # onnxruntime must be imported after the OpenMP environment variables are set,
    # so the import is done inside this function rather than at the top of the script.
    import onnxruntime

    if use_gpu and ('CUDAExecutionProvider'
                    not in onnxruntime.get_available_providers()):
        print(
            "Warning: Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )
    elif (not use_gpu) and ('CUDAExecutionProvider'
                            in onnxruntime.get_available_providers()):
        print(
            "Warning: Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance."
        )

    if intra_op_num_threads is None and graph_optimization_level is None:
        session = onnxruntime.InferenceSession(model_path)
    else:
        execution_providers = ['CPUExecutionProvider'] if not use_gpu else [
            'CUDAExecutionProvider', 'CPUExecutionProvider'
        ]

        sess_options = onnxruntime.SessionOptions()
        sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL

        if graph_optimization_level is None:
            sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        elif graph_optimization_level == 0:
            sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
        elif graph_optimization_level == 1:
            sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
        elif graph_optimization_level == 2:
            sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
        elif graph_optimization_level == 99:
            sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = graph_optimization_level

        if intra_op_num_threads is not None:
            sess_options.intra_op_num_threads = intra_op_num_threads

        session = onnxruntime.InferenceSession(model_path,
                                               sess_options,
                                               providers=execution_providers)

    if use_gpu:
        assert 'CUDAExecutionProvider' in session.get_providers()
    return session
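A usage sketch for create_session(); the model path below is a placeholder, not part of the original example:

# "model.onnx" is a hypothetical path; 99 maps to ORT_ENABLE_ALL in the branch above.
session = create_session("model.onnx",
                         use_gpu=False,
                         intra_op_num_threads=1,
                         graph_optimization_level=99)
print(session.get_providers())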
Example #5
    def testRunContribSparseMatMul(self):
        '''
        Multiply a sparse COO tensor by a dense tensor
        '''
        common_shape = [9, 9]  # inputs and outputs have the same shape
        A_values = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0,
                    10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
                    18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
                    26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0,
                    34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0,
                    42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0,
                    50.0, 51.0, 52.0,  53.0], np.float32)
        # 2-D index
        A_indices = np.array([0, 1, 0, 2, 0, 6, 0, 7, 0, 8, 1, 0, 1,
                                1, 1, 2, 1, 6, 1, 7, 1, 8, 2, 0, 2, 1,
                                2, 2, 2, 6, 2, 7, 2, 8, 3, 3, 3, 4, 3,
                                5, 3, 6, 3, 7, 3, 8, 4, 3, 4, 4, 4, 5,
                                4, 6, 4, 7, 4, 8, 5, 3, 5, 4, 5, 5, 5,
                                6, 5, 7, 5, 8, 6, 0, 6, 1, 6, 2, 6, 3,
                                6, 4, 6, 5, 7, 0, 7, 1, 7, 2, 7, 3, 7,
                                4, 7, 5, 8, 0, 8, 1, 8, 2, 8, 3, 8, 4,
                                8, 5], np.int64).reshape((len(A_values), 2))

        cpu_device = onnxrt.OrtDevice.make('cpu', 0)
        sparse_tensor = onnxrt.SparseTensor.sparse_coo_from_numpy(common_shape, A_values, A_indices, cpu_device)
        A_ort_value = onnxrt.OrtValue.ort_value_from_sparse_tensor(sparse_tensor)

        B_data = np.array([0, 1, 2, 0, 0, 0, 3, 4, 5,
                            6, 7, 8, 0, 0, 0, 9, 10, 11,
                            12, 13, 14, 0, 0, 0, 15, 16, 17,
                            0, 0, 0, 18, 19, 20, 21, 22, 23,
                            0, 0, 0, 24, 25, 26, 27, 28, 29,
                            0, 0, 0, 30, 31, 32, 33, 34, 35,
                            36, 37, 38, 39, 40, 41, 0, 0, 0,
                            42, 43, 44, 45, 46, 47, 0, 0, 0,
                            48, 49, 50, 51, 52, 53, 0, 0, 0], np.float32).reshape(common_shape)
        B_ort_value = onnxrt.OrtValue.ortvalue_from_numpy(B_data)

        Y_result = np.array([546, 561, 576, 552, 564, 576, 39, 42, 45,
                            1410, 1461, 1512, 1362, 1392, 1422, 201, 222, 243,
                            2274, 2361, 2448, 2172, 2220, 2268, 363, 402, 441,
                            2784, 2850, 2916, 4362, 4485, 4608, 1551, 1608, 1665,
                            3540, 3624, 3708, 5604, 5763, 5922, 2037, 2112, 2187,
                            4296, 4398, 4500, 6846, 7041, 7236, 2523, 2616, 2709,
                            678, 789, 900, 2892, 3012, 3132, 4263, 4494, 4725,
                            786, 915, 1044, 3324, 3462, 3600, 4911, 5178, 5445,
                            894, 1041, 1188, 3756, 3912, 4068, 5559, 5862, 6165], np.float64).reshape(common_shape)

        sess = onnxrt.InferenceSession(get_name("sparse_to_dense_matmul.onnx"),
                                       providers=onnxrt.get_available_providers())
        res = sess.run_with_ort_values(["dense_Y"], { "sparse_A" : A_ort_value, "dense_B" : B_ort_value })
        self.assertEqual(len(res), 1)
        ort_value = res[0]
        self.assertTrue(isinstance(ort_value, onnxrt.OrtValue))
        self.assertTrue(ort_value.is_tensor())
        self.assertEqual(ort_value.data_type(), "tensor(float)")
        self.assertEqual(ort_value.shape(), common_shape)
        result = ort_value.numpy()
        self.assertEqual(list(result.shape), common_shape)
        self.assertTrue(np.array_equal(Y_result, result))
Example #6
    def testZipMapStringFloat(self):
        sess = onnxrt.InferenceSession(get_name("zipmap_stringfloat.onnx"),
                                       providers=onnxrt.get_available_providers())
        x = np.array([1.0, 0.0, 3.0, 44.0, 23.0, 11.0], dtype=np.float32).reshape((2, 3))

        x_name = sess.get_inputs()[0].name
        self.assertEqual(x_name, "X")
        x_type = sess.get_inputs()[0].type
        self.assertEqual(x_type, 'tensor(float)')

        output_name = sess.get_outputs()[0].name
        self.assertEqual(output_name, "Z")
        output_type = sess.get_outputs()[0].type
        self.assertEqual(output_type, 'seq(map(string,tensor(float)))')

        output_expected = [{
            'class2': 0.0,
            'class1': 1.0,
            'class3': 3.0
        }, {
            'class2': 23.0,
            'class1': 44.0,
            'class3': 11.0
        }]
        res = sess.run([output_name], {x_name: x})
        self.assertEqual(output_expected, res[0])
Example #7
def ort_session(model_name: str) -> ort.InferenceSession:
    if model_name == "u2netp":
        md5 = "8e83ca70e441ab06c318d82300c84806"
        url = "https://drive.google.com/uc?id=1tNuFmLv0TSNDjYIkjEdeH1IWKQdUA4HR"
    elif model_name == "u2net":
        md5 = "60024c5c889badc19c04ad937298a77b"
        url = "https://drive.google.com/uc?id=1tCU5MM1LhRgGou5OpmpjBQbSrYIUoYab"
    elif model_name == "u2net_human_seg":
        md5 = "c09ddc2e0104f800e3e1bb4652583d1f"
        url = "https://drive.google.com/uc?id=1ZfqwVxu-1XWC1xU1GHIP-FM_Knd_AX5j"
    else:
        raise AssertionError(
            "Choose between u2net, u2netp or u2net_human_seg")

    home = os.getenv("U2NET_HOME", os.path.join("~", ".u2net"))
    path = Path(home).expanduser() / f"{model_name}.onnx"
    path.parents[0].mkdir(parents=True, exist_ok=True)

    if not (path.exists()
            and hashlib.md5(path.read_bytes()).hexdigest() == md5):
        with redirect_stdout(sys.stderr):
            gdown.download(url, str(path), use_cookies=False)

    return ort.InferenceSession(str(path),
                                providers=ort.get_available_providers())
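A usage sketch for ort_session(); the first call downloads the weights, so it assumes network access and the gdown dependency used above:

# Open (and, if necessary, download) the u2net model, then report which
# execution providers the session actually registered.
session = ort_session("u2net")
print(session.get_providers())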
Example #8
    def run_test(
        self,
        model,
        input=None,
        custom_opsets=None,
        batch_size=2,
        rtol=0.001,
        atol=1e-7,
        do_constant_folding=True,
        dynamic_axes=None,
        test_with_inputs=None,
        input_names=None,
        output_names=None,
    ):
        model.eval()

        if input is None:
            input = torch.randn(batch_size, 3, 224, 224, requires_grad=True)

        with torch.no_grad():
            if isinstance(input, torch.Tensor):
                input = (input,)
            # In-place operators will update input tensor data as well.
            # Thus inputs are replicated before every forward call.
            input_copy = copy.deepcopy(input)
            output = model(*input_copy)
            if isinstance(output, torch.Tensor):
                output = (output,)

            # export the model to ONNX
            f = io.BytesIO()
            torch.onnx.export(
                model,
                input_copy,
                f,
                opset_version=self.opset_version,
                do_constant_folding=do_constant_folding,
                keep_initializers_as_inputs=self.keep_initializers_as_inputs,
                dynamic_axes=dynamic_axes,
                input_names=input_names,
                output_names=output_names,
                custom_opsets=custom_opsets,
            )

            # compute onnxruntime output prediction
            ort_sess = onnxruntime.InferenceSession(f.getvalue(), providers=onnxruntime.get_available_providers())
            input_copy = copy.deepcopy(input)
            ort_test_with_input(ort_sess, input_copy, output, rtol, atol)

            # if additional test inputs are provided run the onnx
            # model with these inputs and check the outputs
            if test_with_inputs is not None:
                for test_input in test_with_inputs:
                    if isinstance(test_input, torch.Tensor):
                        test_input = (test_input,)
                    test_input_copy = copy.deepcopy(test_input)
                    output = model(*test_input_copy)
                    if isinstance(output, torch.Tensor):
                        output = (output,)
                    ort_test_with_input(ort_sess, test_input, output, rtol, atol)
Example #9
def run_onnxruntime(onnx_model_path, use_gpu, optimized_model_path=None):
    if use_gpu and 'CUDAExecutionProvider' not in onnxruntime.get_available_providers():
        logger.error("There is no gpu for onnxruntime to do optimization.")

    sess_options = onnxruntime.SessionOptions()
    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED

    if optimized_model_path is None:
        path_prefix = onnx_model_path[:-5]  #remove .onnx suffix
        optimized_model_path = "{}_ort_{}.onnx".format(
            path_prefix, "gpu" if use_gpu else "cpu")

    sess_options.optimized_model_filepath = optimized_model_path

    if not use_gpu:
        session = onnxruntime.InferenceSession(
            onnx_model_path, sess_options, providers=['CPUExecutionProvider'])
    else:
        session = onnxruntime.InferenceSession(onnx_model_path, sess_options)
        assert 'CUDAExecutionProvider' in session.get_providers()  # Make sure there is GPU

    assert os.path.exists(optimized_model_path) and os.path.isfile(
        optimized_model_path)
    logger.info("Save optimized model by onnxruntime to {}".format(
        optimized_model_path))
    return optimized_model_path
Example #10
def optimize_by_model_type(optimization_config, model_type=None):
    middle_path = os.path.join(tempfile.mkdtemp(), 'middle_optimized.onnx')
    cmd = "python -m onnxruntime.transformers.optimizer --input %s --output %s " % (optimization_config.model_path, middle_path)
    if optimization_config.transformer_args:
        cmd += optimization_config.transformer_args
    if model_type:
        cmd += " --model_type {}".format(model_type)
    logger.info("Running TransformersOptimizer with command {}".format(cmd))

    if OLIVE_LOG_LEVEL == "INFO":
        ret = subprocess.run(cmd, shell=True)
    elif OLIVE_LOG_LEVEL == "WARNING":
        ret = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE)
    else:
        ret = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if ret.returncode == 0:
        try:
            default_ep = "CUDAExecutionProvider" if "CUDAExecutionProvider" in ort.get_available_providers() else "CPUExecutionProvider"
            ort.InferenceSession(middle_path, providers=[default_ep])
            logger.info("Transformers optimization finished with success")
            copy(middle_path, optimization_config.model_path)
        except Exception:
            logger.info("Invalid model after transformer optimization. Original model will be used.")
    else:
        logger.info("Transformers optimization failed. Original model will be used for optimization.")
Example #11
 def _get_default_providers(
     gpu_device_id: int,
     disable_copy_in_default_stream: bool,
 ) -> "_ProviderType":
     if gpu_device_id != -1:
         _, free = get_gpu_memory(gpu_device_id)
         gpu_ = {
             "device_id": gpu_device_id,
             "arena_extend_strategy": "kNextPowerOfTwo",
             "gpu_mem_limit": free,
             "cudnn_conv_algo_search": "EXHAUSTIVE",
             "do_copy_in_default_stream": True,
         }
         if disable_copy_in_default_stream:
             logger.warning(
                 "`disable_copy_in_default_stream=True` will set `do_copy_in_default_stream=False`."
                 " This may improve performance but can introduce race conditions."
             )
             gpu_["do_copy_in_default_stream"] = False
         providers = [
             ("CUDAExecutionProvider", gpu_),
             "CPUExecutionProvider",
         ]
     else:
         providers = ort.get_available_providers()
     return providers  # type: ignore[return-value]
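A hedged usage sketch; get_gpu_memory() and the model path are assumptions carried over from the surrounding code:

# Build a provider list for GPU 0 with the default copy behaviour and hand it
# to a session; "model.onnx" is a placeholder path.
providers = _get_default_providers(gpu_device_id=0,
                                   disable_copy_in_default_stream=False)
sess = ort.InferenceSession("model.onnx", providers=providers)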
Example #12
 def testRunSparseOutputOnly(self):
     """
     Try running models using the new run_with_ort_values
     sparse_initializer_as_output.onnx - requires no inputs, but only one output
     that comes from the initializer
     """
     # The below values are a part of the model
     dense_shape = [3, 3]
     values = np.array(
         [1.764052391052246, 0.40015721321105957, 0.978738009929657],
         np.float64)
     indices = np.array([2, 3, 5], np.int64)
     sess = onnxrt.InferenceSession(
         get_name("sparse_initializer_as_output.onnx"),
         providers=onnxrt.get_available_providers(),
     )
     res = sess.run_with_ort_values(["values"], {})
     self.assertEqual(len(res), 1)
     ort_value = res[0]
     self.assertTrue(isinstance(ort_value, onnxrt.OrtValue))
     sparse_output = ort_value.as_sparse_tensor()
     self.assertTrue(isinstance(sparse_output, onnxrt.SparseTensor))
     self.assertEqual(dense_shape, sparse_output.dense_shape())
     self.assertTrue(np.array_equal(values, sparse_output.values()))
     self.assertTrue(
         np.array_equal(indices,
                        sparse_output.as_coo_rep().indices()))
Example #13
 def test_run_model_tree_ensemble_aionnxml_3(self):
     available_providers = onnxrt.get_available_providers()
     # Checks that onnxruntime can load and execute TreeEnsembleRegressor with a double threshold.
     model = get_name("tree_ensemble_as_tensor.onnx")
     # The first threshold of the tree is 1.7999999523162842.
     # All of the 1.79* values below are the same once converted to float32, so
     # predictions must be the same with float32 inputs and different with float64 inputs.
     iris = np.array(
         [
             [0, 1, 1.7999999523162842, 3],
             [0, 1, 1.7999999523, 3],
             [0, 1, 1.79999995232, 3],
         ],
         dtype=np.float64,
     )
     sess = onnxrt.InferenceSession(model, providers=available_providers)
     got = sess.run(None, {"X": iris})
     self.assertEqual(got[0].dtype, np.float64)
     self.assertEqual(got[0].shape, (3, 1))
     res64 = got[0].tolist()
     self.assertEqual(res64, [[0.7284910678863525], [0.7284910678863525], [0.9134243130683899]])
     iris = np.array(
         [
             [0, 1, 1.7999999523162842, 3],
             [0, 1, 1.7999999523, 3],
             [0, 1, 1.79999995232, 3],
         ],
         dtype=np.float32,
     )
     got = sess.run(None, {"X": iris.astype(np.float64)})
     self.assertEqual(got[0].dtype, np.float64)
     self.assertEqual(got[0].shape, (3, 1))
     res32 = got[0].tolist()
     self.assertEqual(res32, [[0.7284910678863525], [0.7284910678863525], [0.7284910678863525]])
Example #14
    def test_bind_input_and_preallocated_output(self):
        input = self.create_ortvalue_input_on_gpu()

        session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers())
        io_binding = session.io_binding()
        
        # Bind input to CUDA
        io_binding.bind_input('X', 'cuda', 0, np.float32, [3, 2], input.data_ptr())

        # Bind output to CUDA
        output = self.create_uninitialized_ortvalue_input_on_gpu()
        io_binding.bind_output('Y', 'cuda', 0, np.float32, [3, 2], output.data_ptr())

        # Sync if different CUDA streams
        io_binding.synchronize_inputs()

        # Invoke Run
        session.run_with_iobinding(io_binding)

        # Sync if different CUDA streams
        io_binding.synchronize_outputs()
        
        # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here)
        ort_output_vals = io_binding.copy_outputs_to_cpu()[0]
        # Validate results
        self.assertTrue(np.array_equal(self.create_expected_output(), ort_output_vals))
        
        # Validate that ORT actually wrote to the pre-allocated buffer by copying the bound
        # output buffer to the host and validating its contents
        ort_output_vals_in_cpu = output.numpy()
        # Validate results
        self.assertTrue(np.array_equal(self.create_expected_output(), ort_output_vals_in_cpu))
Example #15
    def test_pytorch_model_0_gpu(self):
        if 'CUDAExecutionProvider' not in onnxruntime.get_available_providers():
            print("skip test_pytorch_model_0_gpu since no gpu found")
            return

        input = BERT_TEST_MODELS['bert_pytorch_0']
        bert_model = optimize_model(input,
                                    'bert',
                                    gpu_only=True,
                                    num_heads=2,
                                    hidden_size=8,
                                    sequence_length=10,
                                    input_int32=False,
                                    float16=False)

        expected_node_count = {
            'EmbedLayerNormalization': 1,
            'Attention': 12,
            'SkipLayerNormalization': 24,
            'FastGelu': 12,
            'Gelu': 0,
            'BiasGelu': 0
        }
        self.verify_node_count(bert_model, expected_node_count)
Example #16
    def test_run_model_mlnet(self):
        available_providers = onnxrt.get_available_providers()

        # The Windows GPU CI pipeline builds the wheel with both CUDA and DML enabled. ORT does not
        # support cases where one node is assigned to CUDA and another to DML, as it lacks the data
        # transfer capabilities to deal with potentially different device memory. Hence, use a session
        # with only DML and CPU (excluding CUDA) for this test, as it breaks with both CUDA and DML
        # registered.
        if ('CUDAExecutionProvider' in available_providers and 'DmlExecutionProvider' in available_providers):
            sess = onnxrt.InferenceSession(get_name("mlnet_encoder.onnx"), None, ['DmlExecutionProvider', 'CPUExecutionProvider'])
        else:
            sess = onnxrt.InferenceSession(get_name("mlnet_encoder.onnx"))

        names = [_.name for _ in sess.get_outputs()]
        self.assertEqual(['C00', 'C12'], names)
        c0 = np.array([5.], dtype=np.float32).reshape(1, 1)

        c1 = np.array([b'A\0A\0', b"B\0B\0", b"C\0C\0"], np.void).reshape(1, 3)
        res = sess.run(None, {'C0': c0, 'C1': c1})
        mat = res[1]
        total = mat.sum()
        self.assertEqual(total, 2)
        self.assertEqual(list(mat.ravel()),
                         list(np.array([[[0., 0., 0., 0.], [1., 0., 0., 0.], [0., 0., 1., 0.]]]).ravel()))

        # In memory, the size of each element is fixed and equal to the longest element.
        # We cannot use bytes because numpy trims every trailing 0 from strings and bytes
        # before creating the array (to save space). It does not have this behaviour for
        # void, but as a result numpy no longer knows the size of each element; they all
        # have the same size.
        c1 = np.array([b'A\0A\0\0', b"B\0B\0", b"C\0C\0"], np.void).reshape(1, 3)
        res = sess.run(None, {'C0': c0, 'C1': c1})
        mat = res[1]
        total = mat.sum()
        self.assertEqual(total, 0)
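The trailing-zero behaviour described in the comment above can be checked in isolation; a small sketch that only assumes numpy:

import numpy as np

# Compare how numpy handles trailing NUL bytes for the default bytes dtype
# versus np.void: print the resulting dtypes and the raw bytes of each element.
data = [b'A\0A\0\0', b"B\0B\0", b"C\0C\0"]
for dtype in (None, np.void):
    arr = np.array(data, dtype)
    print(arr.dtype, [v.tobytes() for v in arr])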
Example #17
 def testGetProviders(self):
     self.assertTrue('CPUExecutionProvider' in onnxrt.get_available_providers())
     # get_all_providers() returns the default EP order from highest to lowest.
     # CPUExecutionProvider should always be last.
     self.assertTrue('CPUExecutionProvider' == onnxrt.get_all_providers()[-1])
     sess = onnxrt.InferenceSession(get_name("mul_1.onnx"))
     self.assertTrue('CPUExecutionProvider' in sess.get_providers())
Example #18
    def testLabelEncoder(self):
        sess = onnxrt.InferenceSession(get_name("LabelEncoder.onnx"), providers=onnxrt.get_available_providers())
        input_name = sess.get_inputs()[0].name
        self.assertEqual(input_name, "input")
        input_type = str(sess.get_inputs()[0].type)
        self.assertEqual(input_type, "tensor(string)")
        input_shape = sess.get_inputs()[0].shape
        self.assertEqual(input_shape, [1, 1])
        output_name = sess.get_outputs()[0].name
        self.assertEqual(output_name, "variable")
        output_type = sess.get_outputs()[0].type
        self.assertEqual(output_type, "tensor(int64)")
        output_shape = sess.get_outputs()[0].shape
        self.assertEqual(output_shape, [1, 1])

        # Array
        x = np.array([["4"]])
        res = sess.run([output_name], {input_name: x})
        output_expected = np.array([[3]], dtype=np.int64)
        np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08)

        # Python type
        x = np.array(["4"], ndmin=2)
        res = sess.run([output_name], {input_name: x})
        output_expected = np.array([3], ndmin=2, dtype=np.int64)
        np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08)

        x = np.array(["4"], ndmin=2, dtype=object)
        res = sess.run([output_name], {input_name: x})
        output_expected = np.array([3], ndmin=2, dtype=np.int64)
        np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08)
Example #19
    def testRunModelMultipleThreads(self):
        available_providers = onnxrt.get_available_providers()

        # Skip this test for a "pure" DML onnxruntime python wheel. We keep this test enabled for
        # instances where both the DML and CUDA EPs are available (the Windows GPU CI pipeline has
        # this config); the test passes there because CUDA has higher precedence than DML, so the
        # nodes are assigned to only the CUDA EP (which supports this test).
        if ('DmlExecutionProvider' in available_providers
                and not 'CUDAExecutionProvider' in available_providers):
            print(
                "Skipping testRunModelMultipleThreads as the DML EP does not support calling Run() on different threads using the same session object "
            )
        else:
            so = onnxrt.SessionOptions()
            so.log_verbosity_level = 1
            so.logid = "MultiThreadsTest"
            sess = onnxrt.InferenceSession(get_name("mul_1.onnx"),
                                           sess_options=so)
            ro1 = onnxrt.RunOptions()
            ro1.logid = "thread1"
            t1 = threading.Thread(target=self.run_model, args=(sess, ro1))
            ro2 = onnxrt.RunOptions()
            ro2.logid = "thread2"
            t2 = threading.Thread(target=self.run_model, args=(sess, ro2))
            t1.start()
            t2.start()
            t1.join()
            t2.join()
Example #20
def convert_onnx_models_to_ort():
    args = parse_args()

    model_path_or_dir = args.model_path_or_dir.resolve()
    custom_op_library = args.custom_op_library.resolve() if args.custom_op_library else None

    if not model_path_or_dir.is_dir() and not model_path_or_dir.is_file():
        raise FileNotFoundError(
            "Model path '{}' is not a file or directory.".format(
                model_path_or_dir))

    if custom_op_library and not custom_op_library.is_file():
        raise FileNotFoundError(
            "Unable to find custom operator library '{}'".format(
                custom_op_library))

    if args.use_nnapi and 'NnapiExecutionProvider' not in ort.get_available_providers():
        raise ValueError(
            'The NNAPI Execution Provider was not included in this build of ONNX Runtime.'
        )

    _convert(model_path_or_dir, args.optimization_level, args.use_nnapi,
             custom_op_library, args.save_optimized_onnx_model)

    _create_config_file_from_ort_models(model_path_or_dir,
                                        args.enable_type_reduction)
Example #21
    def test_pytorch_model_0_gpu_onnxruntime(self):
        if 'CUDAExecutionProvider' not in onnxruntime.get_available_providers():
            print(
                "skip test_pytorch_model_0_gpu_onnxruntime since no gpu found")
            return

        input = _get_test_model_path('bert_pytorch_0')
        output = 'temp.onnx'
        optimize_by_onnxruntime(input,
                                use_gpu=True,
                                optimized_model_path=output)
        model = ModelProto()
        with open(output, "rb") as f:
            model.ParseFromString(f.read())
        os.remove(output)
        bert_model = OnnxModel(model)
        expected_node_count = {
            'EmbedLayerNormalization': 1,
            'Attention': 12,
            'SkipLayerNormalization': 24,
            'Gelu': 0,
            'FastGelu': 12,
            'BiasGelu': 0
        }
        self.verify_node_count(bert_model, expected_node_count,
                               'test_pytorch_model_0_gpu_onnxruntime')
Example #22
    def test_bind_input_to_cpu_arr(self):
        input = self.create_numpy_input()

        session = onnxrt.InferenceSession(
            get_name("mul_1.onnx"), providers=onnxrt.get_available_providers())
        io_binding = session.io_binding()

        # Bind Numpy object (input) that's on CPU to wherever the model needs it
        io_binding.bind_cpu_input("X", self.create_numpy_input())

        # Bind output to CPU
        io_binding.bind_output("Y")

        # Invoke Run
        session.run_with_iobinding(io_binding)

        # Sync if different CUDA streams
        io_binding.synchronize_outputs()

        # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here)
        ort_output = io_binding.copy_outputs_to_cpu()[0]

        # Validate results
        self.assertTrue(
            np.array_equal(self.create_expected_output(), ort_output))
Example #23
    def test_bind_input_only(self):
        input = self.create_ortvalue_input_on_gpu()

        session = onnxrt.InferenceSession(
            get_name("mul_1.onnx"), providers=onnxrt.get_available_providers())
        io_binding = session.io_binding()

        # Bind input to CUDA
        io_binding.bind_input("X", "cuda", 0, np.float32, [3, 2],
                              input.data_ptr())

        # Sync if different CUDA streams
        io_binding.synchronize_inputs()

        # Bind output to CPU
        io_binding.bind_output("Y")

        # Invoke Run
        session.run_with_iobinding(io_binding)

        # Sync if different CUDA streams
        io_binding.synchronize_outputs()

        # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here)
        ort_output = io_binding.copy_outputs_to_cpu()[0]

        # Validate results
        self.assertTrue(
            np.array_equal(self.create_expected_output(), ort_output))
Example #24
    def testDictVectorizer(self):
        sess = onnxrt.InferenceSession(
            get_name("pipeline_vectorize.onnx"),
            providers=onnxrt.get_available_providers())
        input_name = sess.get_inputs()[0].name
        self.assertEqual(input_name, "float_input")
        input_type = str(sess.get_inputs()[0].type)
        self.assertEqual(input_type, "map(int64,tensor(float))")
        input_shape = sess.get_inputs()[0].shape
        self.assertEqual(input_shape, [])
        output_name = sess.get_outputs()[0].name
        self.assertEqual(output_name, "variable1")
        output_type = sess.get_outputs()[0].type
        self.assertEqual(output_type, "tensor(float)")
        output_shape = sess.get_outputs()[0].shape
        self.assertEqual(output_shape, [1, 1])

        # Python type
        x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966}
        res = sess.run([output_name], {input_name: x})
        output_expected = np.array([[49.752754]], dtype=np.float32)
        np.testing.assert_allclose(output_expected,
                                   res[0],
                                   rtol=1e-05,
                                   atol=1e-08)

        xwrong = x.copy()
        xwrong["a"] = 5.6
        try:
            res = sess.run([output_name], {input_name: xwrong})
        except RuntimeError as e:
            self.assertIn(
                "Unexpected key type  <class 'str'>, it cannot be linked to C type int64_t",
                str(e))

        # numpy type
        x = {np.int64(k): np.float32(v) for k, v in x.items()}
        res = sess.run([output_name], {input_name: x})
        output_expected = np.array([[49.752754]], dtype=np.float32)
        np.testing.assert_allclose(output_expected,
                                   res[0],
                                   rtol=1e-05,
                                   atol=1e-08)

        x = {np.int64(k): np.float64(v) for k, v in x.items()}
        res = sess.run([output_name], {input_name: x})
        output_expected = np.array([[49.752754]], dtype=np.float32)
        np.testing.assert_allclose(output_expected,
                                   res[0],
                                   rtol=1e-05,
                                   atol=1e-08)

        x = {np.int32(k): np.float64(v) for k, v in x.items()}
        res = sess.run([output_name], {input_name: x})
        output_expected = np.array([[49.752754]], dtype=np.float32)
        np.testing.assert_allclose(output_expected,
                                   res[0],
                                   rtol=1e-05,
                                   atol=1e-08)
Example #25
def load(
    tag: t.Union[str, Tag],
    backend: t.Optional[str] = "onnxruntime",
    providers: t.Optional[t.Union["_ProviderType", "_GPUProviderType"]] = None,
    session_options: t.Optional["ort.SessionOptions"] = None,  # type: ignore
    model_store: "ModelStore" = Provide[BentoMLContainer.model_store],
) -> "ort.InferenceSession":
    """
    Load a model from BentoML local modelstore with given name.

    Args:
        tag (:code:`Union[str, Tag]`):
            Tag of a saved model in BentoML local modelstore.
        backend (:code:`str`, `optional`, default to :code:`onnxruntime`):
            Different backend runtimes supported by ONNX. Currently only :obj:`onnxruntime`
            and :obj:`onnxruntime-gpu` are accepted.
        providers (`List[Union[str, Tuple[str, Dict[str, Any]]`, `optional`, default to :code:`None`):
            Different providers provided by users. By default BentoML will use :func:`onnxruntime.get_available_providers`
            when loading a model.
        session_options (`onnxruntime.SessionOptions`, `optional`, default to :code:`None`):
            SessionOptions per use case. If not specified, then default to :code:`None`.
        model_store (:mod:`~bentoml._internal.models.store.ModelStore`, default to :mod:`BentoMLContainer.model_store`):
            BentoML modelstore, provided by DI Container.

    Returns:
        :obj:`onnxruntime.InferenceSession`: an instance of ONNX model from BentoML modelstore.

    Examples:

    .. code-block:: python

        import bentoml

        model = bentoml.onnx.load(tag)

    """  # noqa
    model = model_store.get(tag)
    if model.info.module not in (MODULE_NAME, __name__):
        raise BentoMLException(
            f"Model {tag} was saved with module {model.info.module}, failed loading with {MODULE_NAME}."
        )
    model_file = model.path_of(f"{SAVE_NAMESPACE}{ONNX_EXT}")

    if backend not in SUPPORTED_ONNX_BACKEND:
        raise BentoMLException(
            f"'{backend}' runtime is currently not supported for ONNXModel"
        )
    if providers:
        if not all(i in ort.get_all_providers() for i in flatten_list(providers)):
            raise BentoMLException(f"'{providers}' cannot be parsed by `onnxruntime`")
    else:
        providers = ort.get_available_providers()

    return ort.InferenceSession(
        model_file,
        sess_options=session_options,
        providers=providers,
    )
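A usage sketch built on the docstring's example, passing an explicit provider list; the tag name is hypothetical:

import bentoml
import onnxruntime as ort

# "my_onnx_model:latest" is a placeholder tag; prefer CUDA when this build exposes it.
providers = (["CUDAExecutionProvider", "CPUExecutionProvider"]
             if "CUDAExecutionProvider" in ort.get_available_providers()
             else ["CPUExecutionProvider"])
session = bentoml.onnx.load("my_onnx_model:latest", providers=providers)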
Example #26
 def testSetProviders(self):
     if 'CUDAExecutionProvider' in onnxrt.get_available_providers():
         sess = onnxrt.InferenceSession(get_name("mul_1.onnx"))
         # confirm that CUDA Provider is in list of registered providers.
         self.assertTrue('CUDAExecutionProvider' in sess.get_providers())
         # reset the session and register only CPU Provider.
         sess.set_providers(['CPUExecutionProvider'])
         # confirm only CPU Provider is registered now.
         self.assertEqual(['CPUExecutionProvider'], sess.get_providers())
Example #27
def optimize_by_onnxruntime(
    onnx_model_path: str,
    use_gpu: bool = False,
    optimized_model_path: Optional[str] = None,
    opt_level: Optional[int] = 99,
    disabled_optimizers=[],
) -> str:
    """
    Use onnxruntime to optimize model.

    Args:
        onnx_model_path (str): the path of input onnx model.
        use_gpu (bool): whether the optimized model is targeted to run in GPU.
        optimized_model_path (str or None): the path of optimized model.
        opt_level (int): graph optimization level.
        disabled_optimizers (List[str]): a list of names of disabled optimizers
    Returns:
        optimized_model_path (str): the path of optimized model
    """
    assert opt_level in [1, 2, 99]
    import onnxruntime

    if use_gpu and "CUDAExecutionProvider" not in onnxruntime.get_available_providers():
        logger.error("There is no gpu for onnxruntime to do optimization.")
        return onnx_model_path

    sess_options = onnxruntime.SessionOptions()
    if opt_level == 1:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
    elif opt_level == 2:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
    else:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL

    if optimized_model_path is None:
        path_prefix = onnx_model_path[:-5]  # remove .onnx suffix
        optimized_model_path = "{}_o{}_{}.onnx".format(path_prefix, opt_level, "gpu" if use_gpu else "cpu")

    sess_options.optimized_model_filepath = optimized_model_path

    kwargs = {}
    if disabled_optimizers:
        kwargs["disabled_optimizers"] = disabled_optimizers

    if not use_gpu:
        session = onnxruntime.InferenceSession(
            onnx_model_path, sess_options, providers=["CPUExecutionProvider"], **kwargs
        )
    else:
        session = onnxruntime.InferenceSession(
            onnx_model_path, sess_options, providers=["CUDAExecutionProvider"], **kwargs
        )
        assert "CUDAExecutionProvider" in session.get_providers()  # Make sure there is GPU

    assert os.path.exists(optimized_model_path) and os.path.isfile(optimized_model_path)
    logger.debug("Save optimized model by onnxruntime to {}".format(optimized_model_path))
    return optimized_model_path
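A minimal usage sketch; the input path is a placeholder, and the output name follows from the naming scheme in the function above:

# With these arguments the optimized graph is written next to the input
# as "model_o99_cpu.onnx".
optimized_path = optimize_by_onnxruntime("model.onnx", use_gpu=False, opt_level=99)
print(optimized_path)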
Example #28
def _check_providers(providers):
    providers = providers or []
    if not isinstance(providers, (list, tuple)):
        providers = [providers]
    available_providers = onnxruntime.get_available_providers()
    unavailable = set(providers) - set(available_providers)
    if unavailable:
        raise RuntimeError(f"Unavailable providers {unavailable}")
    return providers
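A short usage sketch: CPUExecutionProvider is available in every build, so the call below always succeeds, while an unavailable name raises the RuntimeError above:

# A bare provider name is wrapped in a list and validated against this build.
providers = _check_providers("CPUExecutionProvider")
assert providers == ["CPUExecutionProvider"]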
Example #29
def perf_test(rnn_type, num_threads, input_dim, hidden_dim, bidirectional, layers, seq_len, batch_size, top_n=5, min_duration_seconds=10):
    model_name = '{}_i{}_h{}_{}_l{}_{}.onnx'.format(rnn_type, input_dim, hidden_dim,
                                                    'bi' if bidirectional else '',
                                                    layers,
                                                    'batched' if batch_size > 1 else 'no_batch')

    generate_model(rnn_type, input_dim, hidden_dim, bidirectional, layers, model_name, batch_size == 1)
    feeds = {'input':np.random.rand(seq_len, batch_size, input_dim).astype(np.float32)}

    # run original model in CPU provider, using all threads
    # there are local thread pools inside the LSTM/GRU CPU kernels
    # that cannot be controlled by OMP or intra_op_num_threads
    sess = onnxruntime.InferenceSession(model_name, providers=['CPUExecutionProvider'])
    count, duration, per_iter_cost = perf_run(sess, feeds, min_counts=top_n, min_duration_seconds=min_duration_seconds)
    avg_rnn = top_n_avg(per_iter_cost, top_n)
    print('perf_rnn (with default threads) {}: run for {} iterations, top {} avg {:.3f} ms'.format(model_name, count, top_n, avg_rnn))

    # run converted model in Nuphar, using specified threads
    with ScopedSetNumThreads(num_threads) as scoped_set_num_threads:
        # run Scan model converted from original in Nuphar
        from .model_editor import convert_to_scan_model
        from ..tools.symbolic_shape_infer import SymbolicShapeInference
        scan_model_name = os.path.splitext(model_name)[0] + '_scan.onnx'
        convert_to_scan_model(model_name, scan_model_name)
        # note that symbolic shape inference is needed because model has symbolic batch dim, thus init_state is ConstantOfShape
        onnx.save(SymbolicShapeInference.infer_shapes(onnx.load(scan_model_name)), scan_model_name)
        sess = onnxruntime.InferenceSession(scan_model_name, providers=onnxruntime.get_available_providers())
        count, duration, per_iter_cost = perf_run(sess, feeds, min_counts=top_n, min_duration_seconds=min_duration_seconds)
        avg_scan = top_n_avg(per_iter_cost, top_n)
        print('perf_scan (with {} threads) {}: run for {} iterations, top {} avg {:.3f} ms'.format(num_threads, scan_model_name, count, top_n, avg_scan))

        # quantize Scan model to int8 and run in Nuphar
        from .model_quantizer import convert_matmul_model
        int8_model_name = os.path.splitext(model_name)[0] + '_int8.onnx'
        convert_matmul_model(scan_model_name, int8_model_name)
        onnx.save(SymbolicShapeInference.infer_shapes(onnx.load(int8_model_name)), int8_model_name)
        sess = onnxruntime.InferenceSession(int8_model_name, providers=onnxruntime.get_available_providers())
        count, duration, per_iter_cost = perf_run(sess, feeds, min_counts=top_n, min_duration_seconds=min_duration_seconds)
        avg_int8 = top_n_avg(per_iter_cost, top_n)
        print('perf_int8 (with {} threads) {}: run for {} iterations, top {} avg {:.3f} ms'.format(num_threads, int8_model_name, count, top_n, avg_int8))

    return avg_rnn, avg_scan, avg_int8
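A hedged usage sketch for perf_test(); 'lstm' as the rnn_type is an assumption based on the LSTM/GRU comment above, and the remaining numbers are arbitrary:

# Assumes generate_model() accepts 'lstm' and that the Nuphar EP is available in this build.
avg_rnn, avg_scan, avg_int8 = perf_test('lstm', num_threads=4, input_dim=128,
                                        hidden_dim=256, bidirectional=True,
                                        layers=2, seq_len=32, batch_size=1)
print(avg_rnn, avg_scan, avg_int8)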
Example #30
def convert_onnx_models_to_ort():
    args = parse_args()

    model_path_or_dir = args.model_path_or_dir.resolve()
    custom_op_library = args.custom_op_library.resolve() if args.custom_op_library else None

    if not model_path_or_dir.is_dir() and not model_path_or_dir.is_file():
        raise FileNotFoundError(
            "Model path '{}' is not a file or directory.".format(
                model_path_or_dir))

    if custom_op_library and not custom_op_library.is_file():
        raise FileNotFoundError(
            "Unable to find custom operator library '{}'".format(
                custom_op_library))

    if args.use_nnapi and 'NnapiExecutionProvider' not in ort.get_available_providers():
        raise ValueError(
            'The NNAPI Execution Provider was not included in this build of ONNX Runtime.'
        )

    if args.use_coreml:
        if not is_macOS():
            # The --use_coreml option only makes sense when the script runs on a macOS device.
            raise ValueError(
                '--use_coreml option requires a MacOS environment.')
        if 'CoreMLExecutionProvider' not in ort.get_available_providers():
            raise ValueError(
                'The CoreML Execution Provider was not included in this build of ONNX Runtime.'
            )

    _convert(model_path_or_dir, args.optimization_level, args.use_nnapi,
             args.use_coreml, custom_op_library,
             args.save_optimized_onnx_model, args.allow_conversion_failures)

    _create_config_file_from_ort_models(model_path_or_dir,
                                        args.optimization_level,
                                        args.enable_type_reduction)