def prepare_environment(cache_dir, output_dir, use_gpu, provider=None):
    if cache_dir and not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    import onnxruntime

    if use_gpu:
        if provider == "dml":
            assert (
                "DmlExecutionProvider" in onnxruntime.get_available_providers()
            ), "Please install onnxruntime-directml package to test GPU inference."
        else:
            assert (
                "CUDAExecutionProvider" in onnxruntime.get_available_providers()
            ), "Please install onnxruntime-gpu package to test GPU inference."

    import transformers

    logger.info(f"PyTorch Version:{torch.__version__}")
    logger.info(f"Transformers Version:{transformers.__version__}")
    logger.info(f"Onnxruntime Version:{onnxruntime.__version__}")

    # Support three major versions of PyTorch and OnnxRuntime, and up to 6 months of transformers.
    from packaging import version

    assert version.parse(torch.__version__) >= version.parse("1.5.0")
    assert version.parse(transformers.__version__) >= version.parse("3.0.0")
    assert version.parse(onnxruntime.__version__) >= version.parse("1.4.0")
def convert_onnx_models_to_ort():
    args = parse_args()

    model_path_or_dir = args.model_path_or_dir.resolve()
    custom_op_library = args.custom_op_library.resolve() if args.custom_op_library else None

    if not model_path_or_dir.is_dir() and not model_path_or_dir.is_file():
        raise FileNotFoundError("Model path '{}' is not a file or directory.".format(model_path_or_dir))

    if custom_op_library and not custom_op_library.is_file():
        raise FileNotFoundError("Unable to find custom operator library '{}'".format(custom_op_library))

    if args.use_nnapi and 'NnapiExecutionProvider' not in ort.get_available_providers():
        raise ValueError('The NNAPI Execution Provider was not included in this build of ONNX Runtime.')

    if args.use_coreml and 'CoreMLExecutionProvider' not in ort.get_available_providers():
        raise ValueError('The CoreML Execution Provider was not included in this build of ONNX Runtime.')

    session_options_config_entries = {}

    if args.nnapi_partitioning_stop_ops is not None:
        session_options_config_entries["ep.nnapi.partitioning_stop_ops"] = args.nnapi_partitioning_stop_ops

    if args.target_platform == 'arm':
        session_options_config_entries["session.qdqisint8allowed"] = "1"
    else:
        session_options_config_entries["session.qdqisint8allowed"] = "0"

    for optimization_level in args.optimization_level:
        print(f"Converting models and creating configuration file for optimization level '{optimization_level}'")

        _convert(model_path_or_dir, optimization_level, args.use_nnapi, args.use_coreml, custom_op_library,
                 args.save_optimized_onnx_model, args.allow_conversion_failures, args.target_platform,
                 session_options_config_entries)

        _create_config_file_from_ort_models(model_path_or_dir, optimization_level, args.enable_type_reduction)
def main():
    args = parse_arguments()

    if args.test_times == 0:
        args.test_times = max(1, int(1000 / args.samples))

    if args.use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
        print(
            "Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )
        return
    elif (not args.use_gpu) and ('CUDAExecutionProvider' in onnxruntime.get_available_providers()):
        print("Warning: Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance.")

    average_latency = {}
    contiguous_latency = run_performance(average_latency, args.model, args.batch_size, args.sequence_length,
                                         args.use_gpu, args.samples, args.test_times, args.seed, args.verbose,
                                         args.all)
    if average_latency is None:
        return

    summary_file = os.path.join(
        Path(args.model).parent,
        "perf_results_{}_B{}_S{}_{}.txt".format('GPU' if args.use_gpu else 'CPU', args.batch_size,
                                                args.sequence_length,
                                                datetime.now().strftime("%Y%m%d-%H%M%S")))
    with open(summary_file, 'w+', newline='') as tsv_file:
        tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')
        headers = None
        for key, latency in average_latency.items():
            params = key.split(',')
            if headers is None:
                headers = ["Latency(ms)", "Throughput(QPS)"]
                headers.extend([x.split('=')[0] for x in params])
                tsv_writer.writerow(headers)

            # Include the extra latency of array conversion if required.
            if args.inclusive and 'contiguous=True' in params:
                latency += contiguous_latency

            throughput = args.batch_size * (1000 / latency)
            values = [format(latency, '.2f'), format(throughput, '.2f')]
            values.extend([x.split('=')[1] for x in params])
            tsv_writer.writerow(values)

    print("Test summary is saved to", summary_file)
def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization_level=None):
    # Importing onnxruntime must happen after the OpenMP environment variables are set,
    # so the import is delayed into this function instead of the top of this script.
    import onnxruntime

    if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
        print(
            "Warning: Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )
    elif (not use_gpu) and ('CUDAExecutionProvider' in onnxruntime.get_available_providers()):
        print("Warning: Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance.")

    if intra_op_num_threads is None and graph_optimization_level is None:
        session = onnxruntime.InferenceSession(model_path)
    else:
        execution_providers = ['CPUExecutionProvider'] if not use_gpu else [
            'CUDAExecutionProvider', 'CPUExecutionProvider'
        ]
        sess_options = onnxruntime.SessionOptions()
        sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL

        if graph_optimization_level is None:
            sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        elif graph_optimization_level == 0:
            sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
        elif graph_optimization_level == 1:
            sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
        elif graph_optimization_level == 2:
            sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
        elif graph_optimization_level == 99:
            sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = graph_optimization_level

        if intra_op_num_threads is not None:
            sess_options.intra_op_num_threads = intra_op_num_threads

        session = onnxruntime.InferenceSession(model_path, sess_options, providers=execution_providers)

    if use_gpu:
        assert 'CUDAExecutionProvider' in session.get_providers()
    return session
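# A minimal, hypothetical usage sketch for create_session (the model path, the float32 input
# assumption, and this helper itself are placeholders, not part of the original script):
def _example_create_session_usage(model_path="model.onnx"):
    import numpy as np
    session = create_session(model_path, use_gpu=False, intra_op_num_threads=4, graph_optimization_level=99)
    input_meta = session.get_inputs()[0]
    # Substitute 1 for any symbolic dimensions and feed zeros, assuming a float32 input.
    shape = [d if isinstance(d, int) else 1 for d in input_meta.shape]
    feed = {input_meta.name: np.zeros(shape, dtype=np.float32)}
    return session.run(None, feed)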
def testRunContribSparseMatMul(self):
    """
    Multiply a sparse COO tensor by a dense tensor.
    """
    common_shape = [9, 9]  # inputs and outputs have the same shape
    A_values = np.array(
        [
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0,
            36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0,
            53.0
        ],
        np.float32)

    # 2-D index
    A_indices = np.array(
        [
            0, 1, 0, 2, 0, 6, 0, 7, 0, 8, 1, 0, 1, 1, 1, 2, 1, 6, 1, 7, 1, 8, 2, 0, 2, 1, 2, 2, 2, 6, 2, 7, 2, 8,
            3, 3, 3, 4, 3, 5, 3, 6, 3, 7, 3, 8, 4, 3, 4, 4, 4, 5, 4, 6, 4, 7, 4, 8, 5, 3, 5, 4, 5, 5, 5, 6, 5, 7,
            5, 8, 6, 0, 6, 1, 6, 2, 6, 3, 6, 4, 6, 5, 7, 0, 7, 1, 7, 2, 7, 3, 7, 4, 7, 5, 8, 0, 8, 1, 8, 2, 8, 3,
            8, 4, 8, 5
        ],
        np.int64).reshape((len(A_values), 2))

    cpu_device = onnxrt.OrtDevice.make('cpu', 0)
    sparse_tensor = onnxrt.SparseTensor.sparse_coo_from_numpy(common_shape, A_values, A_indices, cpu_device)
    A_ort_value = onnxrt.OrtValue.ort_value_from_sparse_tensor(sparse_tensor)

    B_data = np.array(
        [
            0, 1, 2, 0, 0, 0, 3, 4, 5, 6, 7, 8, 0, 0, 0, 9, 10, 11, 12, 13, 14, 0, 0, 0, 15, 16, 17, 0, 0, 0, 18,
            19, 20, 21, 22, 23, 0, 0, 0, 24, 25, 26, 27, 28, 29, 0, 0, 0, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 0, 0, 0, 42, 43, 44, 45, 46, 47, 0, 0, 0, 48, 49, 50, 51, 52, 53, 0, 0, 0
        ],
        np.float32).reshape(common_shape)
    B_ort_value = onnxrt.OrtValue.ortvalue_from_numpy(B_data)

    # np.float was removed from NumPy; np.float64 is its exact equivalent.
    Y_result = np.array(
        [
            546, 561, 576, 552, 564, 576, 39, 42, 45, 1410, 1461, 1512, 1362, 1392, 1422, 201, 222, 243, 2274,
            2361, 2448, 2172, 2220, 2268, 363, 402, 441, 2784, 2850, 2916, 4362, 4485, 4608, 1551, 1608, 1665,
            3540, 3624, 3708, 5604, 5763, 5922, 2037, 2112, 2187, 4296, 4398, 4500, 6846, 7041, 7236, 2523, 2616,
            2709, 678, 789, 900, 2892, 3012, 3132, 4263, 4494, 4725, 786, 915, 1044, 3324, 3462, 3600, 4911, 5178,
            5445, 894, 1041, 1188, 3756, 3912, 4068, 5559, 5862, 6165
        ],
        np.float64).reshape(common_shape)

    sess = onnxrt.InferenceSession(get_name("sparse_to_dense_matmul.onnx"),
                                   providers=onnxrt.get_available_providers())
    res = sess.run_with_ort_values(["dense_Y"], {"sparse_A": A_ort_value, "dense_B": B_ort_value})
    self.assertEqual(len(res), 1)
    ort_value = res[0]
    self.assertTrue(isinstance(ort_value, onnxrt.OrtValue))
    self.assertTrue(ort_value.is_tensor())
    self.assertEqual(ort_value.data_type(), "tensor(float)")
    self.assertEqual(ort_value.shape(), common_shape)
    result = ort_value.numpy()
    self.assertEqual(list(result.shape), common_shape)
    self.assertTrue(np.array_equal(Y_result, result))
def testZipMapStringFloat(self):
    sess = onnxrt.InferenceSession(get_name("zipmap_stringfloat.onnx"),
                                   providers=onnxrt.get_available_providers())
    x = np.array([1.0, 0.0, 3.0, 44.0, 23.0, 11.0], dtype=np.float32).reshape((2, 3))

    x_name = sess.get_inputs()[0].name
    self.assertEqual(x_name, "X")
    x_type = sess.get_inputs()[0].type
    self.assertEqual(x_type, 'tensor(float)')

    output_name = sess.get_outputs()[0].name
    self.assertEqual(output_name, "Z")
    output_type = sess.get_outputs()[0].type
    self.assertEqual(output_type, 'seq(map(string,tensor(float)))')

    output_expected = [{
        'class2': 0.0,
        'class1': 1.0,
        'class3': 3.0
    }, {
        'class2': 23.0,
        'class1': 44.0,
        'class3': 11.0
    }]
    res = sess.run([output_name], {x_name: x})
    self.assertEqual(output_expected, res[0])
def ort_session(model_name: str) -> ort.InferenceSession:
    if model_name == "u2netp":
        md5 = "8e83ca70e441ab06c318d82300c84806"
        url = "https://drive.google.com/uc?id=1tNuFmLv0TSNDjYIkjEdeH1IWKQdUA4HR"
    elif model_name == "u2net":
        md5 = "60024c5c889badc19c04ad937298a77b"
        url = "https://drive.google.com/uc?id=1tCU5MM1LhRgGou5OpmpjBQbSrYIUoYab"
    elif model_name == "u2net_human_seg":
        md5 = "c09ddc2e0104f800e3e1bb4652583d1f"
        url = "https://drive.google.com/uc?id=1ZfqwVxu-1XWC1xU1GHIP-FM_Knd_AX5j"
    else:
        # Raise (not "assert AssertionError(...)") so an unknown model name always fails.
        raise AssertionError("Choose between u2net, u2netp or u2net_human_seg")

    home = os.getenv("U2NET_HOME", os.path.join("~", ".u2net"))
    path = Path(home).expanduser() / f"{model_name}.onnx"
    path.parents[0].mkdir(parents=True, exist_ok=True)

    # Download the model if it is missing or its checksum does not match.
    if not (path.exists() and hashlib.md5(path.read_bytes()).hexdigest() == md5):
        with redirect_stdout(sys.stderr):
            gdown.download(url, str(path), use_cookies=False)

    return ort.InferenceSession(str(path), providers=ort.get_available_providers())
def run_test(
    self,
    model,
    input=None,
    custom_opsets=None,
    batch_size=2,
    rtol=0.001,
    atol=1e-7,
    do_constant_folding=True,
    dynamic_axes=None,
    test_with_inputs=None,
    input_names=None,
    output_names=None,
):
    model.eval()

    if input is None:
        input = torch.randn(batch_size, 3, 224, 224, requires_grad=True)

    with torch.no_grad():
        if isinstance(input, torch.Tensor):
            input = (input,)

        # In-place operators will update input tensor data as well.
        # Thus inputs are replicated before every forward call.
        input_copy = copy.deepcopy(input)
        output = model(*input_copy)
        if isinstance(output, torch.Tensor):
            output = (output,)

        # Export the model to ONNX.
        f = io.BytesIO()
        torch.onnx.export(
            model,
            input_copy,
            f,
            opset_version=self.opset_version,
            do_constant_folding=do_constant_folding,
            keep_initializers_as_inputs=self.keep_initializers_as_inputs,
            dynamic_axes=dynamic_axes,
            input_names=input_names,
            output_names=output_names,
            custom_opsets=custom_opsets,
        )

        # Compute the onnxruntime output prediction.
        ort_sess = onnxruntime.InferenceSession(f.getvalue(), providers=onnxruntime.get_available_providers())
        input_copy = copy.deepcopy(input)
        ort_test_with_input(ort_sess, input_copy, output, rtol, atol)

        # If additional test inputs are provided, run the ONNX
        # model with these inputs and check the outputs.
        if test_with_inputs is not None:
            for test_input in test_with_inputs:
                if isinstance(test_input, torch.Tensor):
                    test_input = (test_input,)
                test_input_copy = copy.deepcopy(test_input)
                output = model(*test_input_copy)
                if isinstance(output, torch.Tensor):
                    output = (output,)
                ort_test_with_input(ort_sess, test_input, output, rtol, atol)
def run_onnxruntime(onnx_model_path, use_gpu, optimized_model_path=None):
    if use_gpu and 'CUDAExecutionProvider' not in onnxruntime.get_available_providers():
        logger.error("There is no gpu for onnxruntime to do optimization.")

    sess_options = onnxruntime.SessionOptions()
    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED

    if optimized_model_path is None:
        path_prefix = onnx_model_path[:-5]  # remove .onnx suffix
        optimized_model_path = "{}_ort_{}.onnx".format(path_prefix, "gpu" if use_gpu else "cpu")

    sess_options.optimized_model_filepath = optimized_model_path

    if not use_gpu:
        session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=['CPUExecutionProvider'])
    else:
        session = onnxruntime.InferenceSession(onnx_model_path, sess_options)
        assert 'CUDAExecutionProvider' in session.get_providers()  # Make sure there is GPU

    assert os.path.exists(optimized_model_path) and os.path.isfile(optimized_model_path)
    logger.info("Save optimized model by onnxruntime to {}".format(optimized_model_path))
    return optimized_model_path
def optimize_by_model_type(optimization_config, model_type=None):
    middle_path = os.path.join(tempfile.mkdtemp(), 'middle_optimized.onnx')
    cmd = "python -m onnxruntime.transformers.optimizer --input %s --output %s " % (
        optimization_config.model_path, middle_path)
    if optimization_config.transformer_args:
        cmd += optimization_config.transformer_args
    if model_type:
        cmd += " --model_type {}".format(model_type)

    logger.info("Running TransformersOptimizer with command {}".format(cmd))

    if OLIVE_LOG_LEVEL == "INFO":
        ret = subprocess.run(cmd, shell=True)
    elif OLIVE_LOG_LEVEL == "WARNING":
        ret = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE)
    else:
        ret = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if ret.returncode == 0:
        try:
            default_ep = "CUDAExecutionProvider" if "CUDAExecutionProvider" in ort.get_available_providers() \
                else "CPUExecutionProvider"
            ort.InferenceSession(middle_path, providers=[default_ep])
            logger.info("Transformers optimization finished with success")
            copy(middle_path, optimization_config.model_path)
        except Exception:
            logger.info("Invalid model after transformer optimization. Original model will be used.")
    else:
        logger.info("Transformers optimization failed. Original model will be used for optimization.")
def _get_default_providers(
    gpu_device_id: int,
    disable_copy_in_default_stream: bool,
) -> "_ProviderType":
    if gpu_device_id != -1:
        _, free = get_gpu_memory(gpu_device_id)
        gpu_ = {
            "device_id": gpu_device_id,
            "arena_extend_strategy": "kNextPowerOfTwo",
            "gpu_mem_limit": free,
            "cudnn_conv_algo_search": "EXHAUSTIVE",
            "do_copy_in_default_stream": True,
        }
        if disable_copy_in_default_stream:
            logger.warning(
                "`disable_copy_in_default_stream=True` will set `do_copy_in_default_stream=False`."
                " There are race conditions and possibly better performance."
            )
            gpu_["do_copy_in_default_stream"] = False
        providers = [
            ("CUDAExecutionProvider", gpu_),
            "CPUExecutionProvider",
        ]
    else:
        providers = ort.get_available_providers()
    return providers  # type: ignore[return-value]
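# Hedged usage sketch for _get_default_providers (this helper and the model path are hypothetical;
# `ort` is assumed to be `import onnxruntime as ort`, consistent with the annotations above):
def _example_get_default_providers(model_path="model.onnx"):
    # gpu_device_id=-1 falls back to whatever providers the local onnxruntime build exposes.
    providers = _get_default_providers(gpu_device_id=-1, disable_copy_in_default_stream=False)
    return ort.InferenceSession(model_path, providers=providers)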
def testRunSparseOutputOnly(self):
    """
    Try running models using the new run_with_ort_values.
    sparse_initializer_as_output.onnx requires no inputs; its only output
    comes from the initializer.
    """
    # The values below are a part of the model.
    dense_shape = [3, 3]
    # np.float was removed from NumPy; np.float64 is its exact equivalent.
    values = np.array([1.764052391052246, 0.40015721321105957, 0.978738009929657], np.float64)
    indices = np.array([2, 3, 5], np.int64)

    sess = onnxrt.InferenceSession(
        get_name("sparse_initializer_as_output.onnx"),
        providers=onnxrt.get_available_providers(),
    )
    res = sess.run_with_ort_values(["values"], {})
    self.assertEqual(len(res), 1)
    ort_value = res[0]
    self.assertTrue(isinstance(ort_value, onnxrt.OrtValue))
    sparse_output = ort_value.as_sparse_tensor()
    self.assertTrue(isinstance(sparse_output, onnxrt.SparseTensor))
    self.assertEqual(dense_shape, sparse_output.dense_shape())
    self.assertTrue(np.array_equal(values, sparse_output.values()))
    self.assertTrue(np.array_equal(indices, sparse_output.as_coo_rep().indices()))
def test_run_model_tree_ensemble_aionnxml_3(self):
    available_providers = onnxrt.get_available_providers()

    # Checks that onnxruntime can load and execute TreeEnsembleRegressor with a double threshold.
    model = get_name("tree_ensemble_as_tensor.onnx")

    # The first threshold of the tree is 1.7999999523162842.
    # All numbers 1.79* below are the same once converted to float32.
    # Predictions must be the same with float32 and different with float64.
    iris = np.array(
        [
            [0, 1, 1.7999999523162842, 3],
            [0, 1, 1.7999999523, 3],
            [0, 1, 1.79999995232, 3],
        ],
        dtype=np.float64,
    )

    sess = onnxrt.InferenceSession(model, providers=available_providers)
    got = sess.run(None, {"X": iris})
    self.assertEqual(got[0].dtype, np.float64)
    self.assertEqual(got[0].shape, (3, 1))
    res64 = got[0].tolist()
    self.assertEqual(res64, [[0.7284910678863525], [0.7284910678863525], [0.9134243130683899]])

    iris = np.array(
        [
            [0, 1, 1.7999999523162842, 3],
            [0, 1, 1.7999999523, 3],
            [0, 1, 1.79999995232, 3],
        ],
        dtype=np.float32,
    )
    got = sess.run(None, {"X": iris.astype(np.float64)})
    self.assertEqual(got[0].dtype, np.float64)
    self.assertEqual(got[0].shape, (3, 1))
    res32 = got[0].tolist()
    self.assertEqual(res32, [[0.7284910678863525], [0.7284910678863525], [0.7284910678863525]])
def test_bind_input_and_preallocated_output(self):
    input = self.create_ortvalue_input_on_gpu()

    session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers())
    io_binding = session.io_binding()

    # Bind input to CUDA
    io_binding.bind_input('X', 'cuda', 0, np.float32, [3, 2], input.data_ptr())

    # Bind output to CUDA
    output = self.create_uninitialized_ortvalue_input_on_gpu()
    io_binding.bind_output('Y', 'cuda', 0, np.float32, [3, 2], output.data_ptr())

    # Sync if different CUDA streams
    io_binding.synchronize_inputs()

    # Invoke Run
    session.run_with_iobinding(io_binding)

    # Sync if different CUDA streams
    io_binding.synchronize_outputs()

    # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here)
    ort_output_vals = io_binding.copy_outputs_to_cpu()[0]
    # Validate results
    self.assertTrue(np.array_equal(self.create_expected_output(), ort_output_vals))

    # Validate if ORT actually wrote to pre-allocated buffer by copying the Torch allocated buffer
    # to the host and validating its contents
    ort_output_vals_in_cpu = output.numpy()
    # Validate results
    self.assertTrue(np.array_equal(self.create_expected_output(), ort_output_vals_in_cpu))
def test_pytorch_model_0_gpu(self):
    if 'CUDAExecutionProvider' not in onnxruntime.get_available_providers():
        print("skip test_pytorch_model_0_gpu since no gpu found")
        return

    input = BERT_TEST_MODELS['bert_pytorch_0']
    bert_model = optimize_model(input,
                                'bert',
                                gpu_only=True,
                                num_heads=2,
                                hidden_size=8,
                                sequence_length=10,
                                input_int32=False,
                                float16=False)

    expected_node_count = {
        'EmbedLayerNormalization': 1,
        'Attention': 12,
        'SkipLayerNormalization': 24,
        'FastGelu': 12,
        'Gelu': 0,
        'BiasGelu': 0
    }
    self.verify_node_count(bert_model, expected_node_count)
def test_run_model_mlnet(self):
    available_providers = onnxrt.get_available_providers()

    # The Windows GPU CI pipeline builds the wheel with both CUDA and DML enabled, and ORT does not support
    # cases where one node is assigned to CUDA and another to DML, as it doesn't have the data transfer
    # capabilities to deal with potentially different device memory. Hence, use a session with only DML and
    # CPU (excluding CUDA) for this test, as it breaks with both CUDA and DML registered.
    if ('CUDAExecutionProvider' in available_providers and 'DmlExecutionProvider' in available_providers):
        sess = onnxrt.InferenceSession(get_name("mlnet_encoder.onnx"), None,
                                       ['DmlExecutionProvider', 'CPUExecutionProvider'])
    else:
        sess = onnxrt.InferenceSession(get_name("mlnet_encoder.onnx"))

    names = [_.name for _ in sess.get_outputs()]
    self.assertEqual(['C00', 'C12'], names)
    c0 = np.array([5.], dtype=np.float32).reshape(1, 1)

    c1 = np.array([b'A\0A\0', b"B\0B\0", b"C\0C\0"], np.void).reshape(1, 3)
    res = sess.run(None, {'C0': c0, 'C1': c1})
    mat = res[1]
    total = mat.sum()
    self.assertEqual(total, 2)
    self.assertEqual(list(mat.ravel()),
                     list(np.array([[[0., 0., 0., 0.], [1., 0., 0., 0.], [0., 0., 1., 0.]]]).ravel()))

    # In memory, the size of each element is fixed and equal to the longest element. We cannot use bytes
    # because numpy trims every trailing 0 from strings and bytes before creating the array (to save space).
    # It does not have this behaviour for void, but as a result numpy no longer knows the size of each
    # element; they all have the same size.
    c1 = np.array([b'A\0A\0\0', b"B\0B\0", b"C\0C\0"], np.void).reshape(1, 3)
    res = sess.run(None, {'C0': c0, 'C1': c1})
    mat = res[1]
    total = mat.sum()
    self.assertEqual(total, 0)
def testGetProviders(self):
    self.assertTrue('CPUExecutionProvider' in onnxrt.get_available_providers())
    # get_all_providers() returns the default EP order from highest to lowest.
    # CPUExecutionProvider should always be last.
    self.assertTrue('CPUExecutionProvider' == onnxrt.get_all_providers()[-1])
    sess = onnxrt.InferenceSession(get_name("mul_1.onnx"))
    self.assertTrue('CPUExecutionProvider' in sess.get_providers())
def testLabelEncoder(self):
    sess = onnxrt.InferenceSession(get_name("LabelEncoder.onnx"), providers=onnxrt.get_available_providers())

    input_name = sess.get_inputs()[0].name
    self.assertEqual(input_name, "input")
    input_type = str(sess.get_inputs()[0].type)
    self.assertEqual(input_type, "tensor(string)")
    input_shape = sess.get_inputs()[0].shape
    self.assertEqual(input_shape, [1, 1])

    output_name = sess.get_outputs()[0].name
    self.assertEqual(output_name, "variable")
    output_type = sess.get_outputs()[0].type
    self.assertEqual(output_type, "tensor(int64)")
    output_shape = sess.get_outputs()[0].shape
    self.assertEqual(output_shape, [1, 1])

    # Array
    x = np.array([["4"]])
    res = sess.run([output_name], {input_name: x})
    output_expected = np.array([[3]], dtype=np.int64)
    np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08)

    # Python type
    x = np.array(["4"], ndmin=2)
    res = sess.run([output_name], {input_name: x})
    output_expected = np.array([3], ndmin=2, dtype=np.int64)
    np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08)

    x = np.array(["4"], ndmin=2, dtype=object)
    res = sess.run([output_name], {input_name: x})
    output_expected = np.array([3], ndmin=2, dtype=np.int64)
    np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08)
def testRunModelMultipleThreads(self):
    available_providers = onnxrt.get_available_providers()

    # Skip this test for a "pure" DML onnxruntime python wheel. We keep this test enabled for instances
    # where both DML and CUDA EPs are available (the Windows GPU CI pipeline has this config) - this test
    # will pass because CUDA has higher precedence than DML and the nodes are assigned to only the CUDA EP
    # (which supports this test).
    if ('DmlExecutionProvider' in available_providers and not 'CUDAExecutionProvider' in available_providers):
        print(
            "Skipping testRunModelMultipleThreads as the DML EP does not support calling Run() on different threads using the same session object "
        )
    else:
        so = onnxrt.SessionOptions()
        so.log_verbosity_level = 1
        so.logid = "MultiThreadsTest"
        sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=so)

        ro1 = onnxrt.RunOptions()
        ro1.logid = "thread1"
        t1 = threading.Thread(target=self.run_model, args=(sess, ro1))

        ro2 = onnxrt.RunOptions()
        ro2.logid = "thread2"
        t2 = threading.Thread(target=self.run_model, args=(sess, ro2))

        t1.start()
        t2.start()
        t1.join()
        t2.join()
def convert_onnx_models_to_ort():
    args = parse_args()

    model_path_or_dir = args.model_path_or_dir.resolve()
    custom_op_library = args.custom_op_library.resolve() if args.custom_op_library else None

    if not model_path_or_dir.is_dir() and not model_path_or_dir.is_file():
        raise FileNotFoundError("Model path '{}' is not a file or directory.".format(model_path_or_dir))

    if custom_op_library and not custom_op_library.is_file():
        raise FileNotFoundError("Unable to find custom operator library '{}'".format(custom_op_library))

    if args.use_nnapi and 'NnapiExecutionProvider' not in ort.get_available_providers():
        raise ValueError('The NNAPI Execution Provider was not included in this build of ONNX Runtime.')

    _convert(model_path_or_dir, args.optimization_level, args.use_nnapi, custom_op_library,
             args.save_optimized_onnx_model)
    _create_config_file_from_ort_models(model_path_or_dir, args.enable_type_reduction)
def test_pytorch_model_0_gpu_onnxruntime(self):
    if 'CUDAExecutionProvider' not in onnxruntime.get_available_providers():
        print("skip test_pytorch_model_0_gpu_onnxruntime since no gpu found")
        return

    input = _get_test_model_path('bert_pytorch_0')
    output = 'temp.onnx'
    optimize_by_onnxruntime(input, use_gpu=True, optimized_model_path=output)

    model = ModelProto()
    with open(output, "rb") as f:
        model.ParseFromString(f.read())
    os.remove(output)
    bert_model = OnnxModel(model)

    expected_node_count = {
        'EmbedLayerNormalization': 1,
        'Attention': 12,
        'SkipLayerNormalization': 24,
        'Gelu': 0,
        'FastGelu': 12,
        'BiasGelu': 0
    }
    self.verify_node_count(bert_model, expected_node_count, 'test_pytorch_model_0_gpu_onnxruntime')
def test_bind_input_to_cpu_arr(self):
    input = self.create_numpy_input()

    session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers())
    io_binding = session.io_binding()

    # Bind Numpy object (input) that's on CPU to wherever the model needs it
    io_binding.bind_cpu_input("X", self.create_numpy_input())

    # Bind output to CPU
    io_binding.bind_output("Y")

    # Invoke Run
    session.run_with_iobinding(io_binding)

    # Sync if different CUDA streams
    io_binding.synchronize_outputs()

    # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here)
    ort_output = io_binding.copy_outputs_to_cpu()[0]

    # Validate results
    self.assertTrue(np.array_equal(self.create_expected_output(), ort_output))
def test_bind_input_only(self):
    input = self.create_ortvalue_input_on_gpu()

    session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers())
    io_binding = session.io_binding()

    # Bind input to CUDA
    io_binding.bind_input("X", "cuda", 0, np.float32, [3, 2], input.data_ptr())

    # Sync if different CUDA streams
    io_binding.synchronize_inputs()

    # Bind output to CPU
    io_binding.bind_output("Y")

    # Invoke Run
    session.run_with_iobinding(io_binding)

    # Sync if different CUDA streams
    io_binding.synchronize_outputs()

    # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here)
    ort_output = io_binding.copy_outputs_to_cpu()[0]

    # Validate results
    self.assertTrue(np.array_equal(self.create_expected_output(), ort_output))
def testDictVectorizer(self):
    sess = onnxrt.InferenceSession(get_name("pipeline_vectorize.onnx"),
                                   providers=onnxrt.get_available_providers())

    input_name = sess.get_inputs()[0].name
    self.assertEqual(input_name, "float_input")
    input_type = str(sess.get_inputs()[0].type)
    self.assertEqual(input_type, "map(int64,tensor(float))")
    input_shape = sess.get_inputs()[0].shape
    self.assertEqual(input_shape, [])

    output_name = sess.get_outputs()[0].name
    self.assertEqual(output_name, "variable1")
    output_type = sess.get_outputs()[0].type
    self.assertEqual(output_type, "tensor(float)")
    output_shape = sess.get_outputs()[0].shape
    self.assertEqual(output_shape, [1, 1])

    # Python type
    x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966}
    res = sess.run([output_name], {input_name: x})
    output_expected = np.array([[49.752754]], dtype=np.float32)
    np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08)

    xwrong = x.copy()
    xwrong["a"] = 5.6
    try:
        res = sess.run([output_name], {input_name: xwrong})
    except RuntimeError as e:
        self.assertIn("Unexpected key type <class 'str'>, it cannot be linked to C type int64_t", str(e))

    # numpy type
    x = {np.int64(k): np.float32(v) for k, v in x.items()}
    res = sess.run([output_name], {input_name: x})
    output_expected = np.array([[49.752754]], dtype=np.float32)
    np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08)

    x = {np.int64(k): np.float64(v) for k, v in x.items()}
    res = sess.run([output_name], {input_name: x})
    output_expected = np.array([[49.752754]], dtype=np.float32)
    np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08)

    x = {np.int32(k): np.float64(v) for k, v in x.items()}
    res = sess.run([output_name], {input_name: x})
    output_expected = np.array([[49.752754]], dtype=np.float32)
    np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08)
def load(
    tag: t.Union[str, Tag],
    backend: t.Optional[str] = "onnxruntime",
    providers: t.Optional[t.Union["_ProviderType", "_GPUProviderType"]] = None,
    session_options: t.Optional["ort.SessionOptions"] = None,  # type: ignore
    model_store: "ModelStore" = Provide[BentoMLContainer.model_store],
) -> "ort.InferenceSession":
    """
    Load a model from BentoML local modelstore with given name.

    Args:
        tag (:code:`Union[str, Tag]`):
            Tag of a saved model in BentoML local modelstore.
        backend (:code:`str`, `optional`, default to :code:`onnxruntime`):
            Different backend runtime supported by ONNX. Currently only accepts
            :obj:`onnxruntime` and :obj:`onnxruntime-gpu`.
        providers (`List[Union[str, Tuple[str, Dict[str, Any]]]]`, `optional`, default to :code:`None`):
            Different providers provided by users. By default BentoML will use
            :func:`onnxruntime.get_available_providers` when loading a model.
        session_options (`onnxruntime.SessionOptions`, `optional`, default to :code:`None`):
            SessionOptions per use case. If not specified, then default to :code:`None`.
        model_store (:mod:`~bentoml._internal.models.store.ModelStore`, default to :mod:`BentoMLContainer.model_store`):
            BentoML modelstore, provided by DI Container.

    Returns:
        :obj:`onnxruntime.InferenceSession`: an instance of ONNX model from BentoML modelstore.

    Examples:

    .. code-block:: python

        import bentoml

        model = bentoml.onnx.load(tag)

    """  # noqa
    model = model_store.get(tag)
    if model.info.module not in (MODULE_NAME, __name__):
        raise BentoMLException(
            f"Model {tag} was saved with module {model.info.module}, failed loading with {MODULE_NAME}."
        )
    model_file = model.path_of(f"{SAVE_NAMESPACE}{ONNX_EXT}")

    if backend not in SUPPORTED_ONNX_BACKEND:
        raise BentoMLException(
            f"'{backend}' runtime is currently not supported for ONNXModel"
        )
    if providers:
        if not all(i in ort.get_all_providers() for i in flatten_list(providers)):
            raise BentoMLException(f"'{providers}' cannot be parsed by `onnxruntime`")
    else:
        providers = ort.get_available_providers()

    return ort.InferenceSession(
        model_file,
        sess_options=session_options,
        providers=providers,
    )
def testSetProviders(self):
    if 'CUDAExecutionProvider' in onnxrt.get_available_providers():
        sess = onnxrt.InferenceSession(get_name("mul_1.onnx"))
        # Confirm that the CUDA provider is in the list of registered providers.
        self.assertTrue('CUDAExecutionProvider' in sess.get_providers())
        # Reset the session and register only the CPU provider.
        sess.set_providers(['CPUExecutionProvider'])
        # Confirm only the CPU provider is registered now.
        self.assertEqual(['CPUExecutionProvider'], sess.get_providers())
def optimize_by_onnxruntime(
    onnx_model_path: str,
    use_gpu: bool = False,
    optimized_model_path: Optional[str] = None,
    opt_level: Optional[int] = 99,
    disabled_optimizers=[],
) -> str:
    """
    Use onnxruntime to optimize model.

    Args:
        onnx_model_path (str): the path of input onnx model.
        use_gpu (bool): whether the optimized model is targeted to run in GPU.
        optimized_model_path (str or None): the path of optimized model.
        opt_level (int): graph optimization level.
        disabled_optimizers (List[str]): a list of names of disabled optimizers

    Returns:
        optimized_model_path (str): the path of optimized model
    """
    assert opt_level in [1, 2, 99]
    import onnxruntime

    if use_gpu and "CUDAExecutionProvider" not in onnxruntime.get_available_providers():
        logger.error("There is no gpu for onnxruntime to do optimization.")
        return onnx_model_path

    sess_options = onnxruntime.SessionOptions()
    if opt_level == 1:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
    elif opt_level == 2:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
    else:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL

    if optimized_model_path is None:
        path_prefix = onnx_model_path[:-5]  # remove .onnx suffix
        optimized_model_path = "{}_o{}_{}.onnx".format(path_prefix, opt_level, "gpu" if use_gpu else "cpu")

    sess_options.optimized_model_filepath = optimized_model_path

    kwargs = {}
    if disabled_optimizers:
        kwargs["disabled_optimizers"] = disabled_optimizers

    if not use_gpu:
        session = onnxruntime.InferenceSession(
            onnx_model_path, sess_options, providers=["CPUExecutionProvider"], **kwargs
        )
    else:
        session = onnxruntime.InferenceSession(
            onnx_model_path, sess_options, providers=["CUDAExecutionProvider"], **kwargs
        )
        assert "CUDAExecutionProvider" in session.get_providers()  # Make sure there is GPU

    assert os.path.exists(optimized_model_path) and os.path.isfile(optimized_model_path)
    logger.debug("Save optimized model by onnxruntime to {}".format(optimized_model_path))
    return optimized_model_path
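# Hedged usage sketch for optimize_by_onnxruntime (this helper, the input path, and the choice of
# disabled optimizer are hypothetical placeholders, not part of the original module):
def _example_optimize_by_onnxruntime(model_path="bert.onnx"):
    # With default settings this writes e.g. "bert_o99_cpu.onnx" next to the input and returns its path.
    optimized_path = optimize_by_onnxruntime(model_path, use_gpu=False, opt_level=99)
    # Optimizers are disabled by name, e.g. skipping constant folding at the basic level.
    basic_only_path = optimize_by_onnxruntime(model_path, opt_level=1, disabled_optimizers=["ConstantFolding"])
    return optimized_path, basic_only_path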
def _check_providers(providers):
    providers = providers or []
    if not isinstance(providers, (list, tuple)):
        providers = [providers]
    available_providers = onnxruntime.get_available_providers()
    unavailable = set(providers) - set(available_providers)
    if unavailable:
        raise RuntimeError(f"Unavailable providers {unavailable}")
    return providers
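# Hedged usage sketch for _check_providers (this helper is illustrative only; the provider names are
# standard ONNX Runtime EP identifiers, and whether they validate depends on the local build):
def _example_check_providers():
    cpu_only = _check_providers("CPUExecutionProvider")    # a bare string is wrapped in a list
    explicit = _check_providers(["CPUExecutionProvider"])  # lists and tuples pass through unchanged
    none_case = _check_providers(None)                     # None is normalized to []
    return cpu_only, explicit, none_case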
def perf_test(rnn_type, num_threads, input_dim, hidden_dim, bidirectional, layers, seq_len, batch_size,
              top_n=5, min_duration_seconds=10):
    model_name = '{}_i{}_h{}_{}_l{}_{}.onnx'.format(rnn_type, input_dim, hidden_dim,
                                                    'bi' if bidirectional else '', layers,
                                                    'batched' if batch_size > 1 else 'no_batch')
    generate_model(rnn_type, input_dim, hidden_dim, bidirectional, layers, model_name, batch_size == 1)

    feeds = {'input': np.random.rand(seq_len, batch_size, input_dim).astype(np.float32)}

    # Run the original model in the CPU provider, using all threads.
    # There is a local thread pool inside the LSTM/GRU CPU kernel
    # that cannot be controlled by OMP or intra_op_num_threads.
    sess = onnxruntime.InferenceSession(model_name, providers=['CPUExecutionProvider'])
    count, duration, per_iter_cost = perf_run(sess, feeds, min_counts=top_n,
                                              min_duration_seconds=min_duration_seconds)
    avg_rnn = top_n_avg(per_iter_cost, top_n)
    print('perf_rnn (with default threads) {}: run for {} iterations, top {} avg {:.3f} ms'.format(
        model_name, count, top_n, avg_rnn))

    # Run the converted model in Nuphar, using the specified number of threads.
    with ScopedSetNumThreads(num_threads) as scoped_set_num_threads:
        # Run the Scan model converted from the original in Nuphar.
        from .model_editor import convert_to_scan_model
        from ..tools.symbolic_shape_infer import SymbolicShapeInference
        scan_model_name = os.path.splitext(model_name)[0] + '_scan.onnx'
        convert_to_scan_model(model_name, scan_model_name)
        # Note that symbolic shape inference is needed because the model has a symbolic batch dim,
        # thus init_state is ConstantOfShape.
        onnx.save(SymbolicShapeInference.infer_shapes(onnx.load(scan_model_name)), scan_model_name)
        sess = onnxruntime.InferenceSession(scan_model_name, providers=onnxruntime.get_available_providers())
        count, duration, per_iter_cost = perf_run(sess, feeds, min_counts=top_n,
                                                  min_duration_seconds=min_duration_seconds)
        avg_scan = top_n_avg(per_iter_cost, top_n)
        print('perf_scan (with {} threads) {}: run for {} iterations, top {} avg {:.3f} ms'.format(
            num_threads, scan_model_name, count, top_n, avg_scan))

        # Quantize the Scan model to int8 and run it in Nuphar.
        from .model_quantizer import convert_matmul_model
        int8_model_name = os.path.splitext(model_name)[0] + '_int8.onnx'
        convert_matmul_model(scan_model_name, int8_model_name)
        onnx.save(SymbolicShapeInference.infer_shapes(onnx.load(int8_model_name)), int8_model_name)
        sess = onnxruntime.InferenceSession(int8_model_name, providers=onnxruntime.get_available_providers())
        count, duration, per_iter_cost = perf_run(sess, feeds, min_counts=top_n,
                                                  min_duration_seconds=min_duration_seconds)
        avg_int8 = top_n_avg(per_iter_cost, top_n)
        print('perf_int8 (with {} threads) {}: run for {} iterations, top {} avg {:.3f} ms'.format(
            num_threads, int8_model_name, count, top_n, avg_int8))

    return avg_rnn, avg_scan, avg_int8
def convert_onnx_models_to_ort():
    args = parse_args()

    model_path_or_dir = args.model_path_or_dir.resolve()
    custom_op_library = args.custom_op_library.resolve() if args.custom_op_library else None

    if not model_path_or_dir.is_dir() and not model_path_or_dir.is_file():
        raise FileNotFoundError("Model path '{}' is not a file or directory.".format(model_path_or_dir))

    if custom_op_library and not custom_op_library.is_file():
        raise FileNotFoundError("Unable to find custom operator library '{}'".format(custom_op_library))

    if args.use_nnapi and 'NnapiExecutionProvider' not in ort.get_available_providers():
        raise ValueError('The NNAPI Execution Provider was not included in this build of ONNX Runtime.')

    if args.use_coreml:
        if not is_macOS():  # Check if the script is run on a Mac device in this case
            raise ValueError('--use_coreml option requires a MacOS environment.')
        if 'CoreMLExecutionProvider' not in ort.get_available_providers():
            raise ValueError('The CoreML Execution Provider was not included in this build of ONNX Runtime.')

    _convert(model_path_or_dir, args.optimization_level, args.use_nnapi, args.use_coreml, custom_op_library,
             args.save_optimized_onnx_model, args.allow_conversion_failures)
    _create_config_file_from_ort_models(model_path_or_dir, args.optimization_level, args.enable_type_reduction)