def create_onnx_session(self, onnx_model_path, provider='CPUExecutionProvider'):
    """Creates an ONNX inference session from the provided onnx_model_path."""
    from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # Few properties that might have an impact on performance (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = 0
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    # Load the model as a graph and prepare the CPU backend
    session = InferenceSession(onnx_model_path, options, providers=[provider])
    session.disable_fallback()

    # if 'OMP_NUM_THREADS' not in os.environ or 'OMP_WAIT_POLICY' not in os.environ:
    #     warnings.warn('''We recommend adding the following at top of script for CPU inference:
    #     from psutil import cpu_count
    #     # Constants from the performance optimization available in onnxruntime
    #     # It needs to be done before importing onnxruntime
    #     os.environ["OMP_NUM_THREADS"] = str(cpu_count(logical=True))
    #     os.environ["OMP_WAIT_POLICY"] = 'ACTIVE'
    #     ''')
    return session
def onnx_runtime_inference(onnx_model_path, tokenizer, sentence1_list, sentence2_list, batch_size=None):
    import numpy as np
    from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions

    # Few properties that might have an impact on performance (provided by MS)
    options = SessionOptions()
    # options.intra_op_num_threads = 1
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    # Load the model as a graph and prepare the CUDA backend, falling back to CPU
    ort_session = InferenceSession(
        onnx_model_path, options,
        providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])

    tokens = tokenizer(sentence1_list, sentence2_list, padding=True,
                       truncation="longest_first", max_length=64,
                       return_tensors='pt')
    if batch_size is None:
        ort_inputs = {k: to_numpy(v) for k, v in tokens.items()}
        ort_outs = ort_session.run(None, ort_inputs)[0]
    else:
        batches = onnx_batch_generator(tokens, batch_size=batch_size)
        ort_outs = np.vstack([ort_session.run(None, b)[0] for b in batches])
    return ort_outs
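# The helpers to_numpy and onnx_batch_generator used above are assumed to be
# defined elsewhere in the module. A minimal sketch of what they would need
# to do, under that assumption (names and behaviour are inferred, not the
# confirmed originals):
import numpy as np

def to_numpy(tensor):
    # Detach from the autograd graph and move to host memory before handing
    # the array to onnxruntime.
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

def onnx_batch_generator(tokens, batch_size=32):
    # Yield onnxruntime-ready feed dicts, slicing every tokenizer field the
    # same way so input_ids/attention_mask stay aligned.
    n = len(tokens['input_ids'])
    for start in range(0, n, batch_size):
        yield {k: to_numpy(v[start:start + batch_size]) for k, v in tokens.items()}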
def create_ort_session(onnx_model_path, use_gpu=True):
    from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, __version__ as onnxruntime_version
    sess_options = SessionOptions()
    sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
    sess_options.intra_op_num_threads = 2
    sess_options.log_severity_level = 2
    execution_providers = ['CPUExecutionProvider'] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider']
    return InferenceSession(onnx_model_path, sess_options, providers=execution_providers)
def create_onnxruntime_session(onnx_model_path, use_gpu, enable_all_optimization=True,
                               num_threads=-1, enable_profiling=False, verbose=False,
                               use_dml=False):
    session = None
    try:
        from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, __version__ as onnxruntime_version
        sess_options = SessionOptions()

        if enable_all_optimization:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        if enable_profiling:
            sess_options.enable_profiling = True

        if num_threads > 0:
            sess_options.intra_op_num_threads = num_threads
            logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}")

        if verbose:
            sess_options.log_severity_level = 0
        else:
            sess_options.log_severity_level = 4

        logger.debug(f"Create session for onnx model: {onnx_model_path}")
        if use_gpu:
            if use_dml:
                execution_providers = ['DmlExecutionProvider', 'CPUExecutionProvider']
            else:
                execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
        else:
            execution_providers = ['CPUExecutionProvider']
        session = InferenceSession(onnx_model_path, sess_options, providers=execution_providers)
    except Exception:
        logger.error("Exception", exc_info=True)

    return session
def create_ort_session(model_path, use_gpu):
    from onnxruntime import SessionOptions, InferenceSession, __version__ as ort_version, GraphOptimizationLevel
    sess_options = SessionOptions()
    sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
    execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if use_gpu else ['CPUExecutionProvider']
    ort_session = InferenceSession(model_path, sess_options, providers=execution_providers)
    return ort_session
def optimize_model(model_path: Path, opt_model_path: Path):
    """
    Generate a model that applies graph optimization (constant folding, etc.).
    The optimized graph is written to opt_model_path as a side effect of
    creating the session.

    :param model_path: path to the original onnx model
    :param opt_model_path: path to the optimized onnx model
    """
    sess_option = SessionOptions()
    sess_option.optimized_model_filepath = opt_model_path.as_posix()
    sess_option.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
    _ = InferenceSession(model_path.as_posix(), sess_option, providers=["CPUExecutionProvider"])
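# A minimal usage sketch for optimize_model (assumes SessionOptions,
# GraphOptimizationLevel and InferenceSession are imported at module level,
# as the function body requires; the file names are placeholders). Setting
# optimized_model_filepath makes onnxruntime serialize the optimized graph
# to that path when the session is created.
from pathlib import Path

optimize_model(Path("model.onnx"), Path("model-opt.onnx"))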
def create_onnx_session(onnx_model_path):
    provider = 'CPUExecutionProvider'
    from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    options = SessionOptions()
    options.intra_op_num_threads = 0
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    session = InferenceSession(onnx_model_path, options, providers=[provider])
    session.disable_fallback()
    return session
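# A minimal usage sketch ("model.onnx" is a placeholder path). The input
# names and shapes can be inspected on the returned session before building
# a feed dictionary for session.run:
session = create_onnx_session("model.onnx")
for inp in session.get_inputs():
    print(inp.name, inp.shape, inp.type)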
def create_onnxruntime_session(onnx_model_path, use_gpu, enable_all_optimization=True,
                               num_threads=-1, verbose=False):
    session = None
    try:
        from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, __version__ as onnxruntime_version
        sess_options = SessionOptions()

        if enable_all_optimization:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        if num_threads > 0:
            sess_options.intra_op_num_threads = num_threads
            logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}")
        elif (not use_gpu) and (version.parse(onnxruntime_version) < version.parse('1.3.0')):
            # Set intra_op_num_threads = 1 to enable OpenMP for onnxruntime 1.2.0 (cpu).
            # onnxruntime-gpu is not built with OpenMP, so it is better to use
            # the default (0) or cpu_count instead.
            sess_options.intra_op_num_threads = 1

        if verbose:
            sess_options.log_severity_level = 0

        logger.debug(f"Create session for onnx model: {onnx_model_path}")
        execution_providers = ['CPUExecutionProvider'] if not use_gpu else [
            'CUDAExecutionProvider', 'CPUExecutionProvider']
        session = InferenceSession(onnx_model_path, sess_options, providers=execution_providers)
    except Exception:
        logger.error("Exception", exc_info=True)

    return session
def create_model_for_provider(model_path: str,
                              provider: str = 'CPUExecutionProvider') -> InferenceSession:
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # Few properties that might have an impact on performance (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = int(os.environ.get('NUM_THREADS', 4))
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    # Load the model as a graph and prepare the CPU backend
    session = InferenceSession(model_path, options, providers=[provider])
    session.disable_fallback()
    return session
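# Usage sketch: this variant reads its thread count from the NUM_THREADS
# environment variable, so it can be tuned without code changes
# ("model.onnx" is a placeholder path):
import os

os.environ['NUM_THREADS'] = '8'
session = create_model_for_provider("model.onnx")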
def optimize_model(model_path: Path):
    '''
    Generate a model that applies graph optimization (constant folding, etc.).

    :param model_path: path to the original onnx model
    :return: optimized onnx model
    '''
    opt_model_path = generate_identified_filename(model_path, "-opt")
    sess_option = SessionOptions()
    sess_option.optimized_model_filepath = opt_model_path.as_posix()
    sess_option.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
    # providers must be set explicitly since onnxruntime 1.9
    _ = InferenceSession(model_path.as_posix(), sess_option,
                         providers=["CPUExecutionProvider"])
    optimized_model = onnx.load(opt_model_path.as_posix())
    return optimized_model
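# generate_identified_filename is assumed to be a small path helper defined
# elsewhere; a plausible sketch inferred from its use above (not the
# confirmed original implementation):
from pathlib import Path

def generate_identified_filename(model_path: Path, identifier: str) -> Path:
    # "model.onnx" with identifier "-opt" -> "model-opt.onnx", same directory
    return model_path.parent.joinpath(model_path.stem + identifier).with_suffix(model_path.suffix)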
def create_ort_session(model_path, use_gpu):
    from onnxruntime import SessionOptions, InferenceSession, __version__ as ort_version, GraphOptimizationLevel, get_available_providers
    sess_options = SessionOptions()
    sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
    execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if use_gpu else ['CPUExecutionProvider']
    if use_gpu:
        if 'CUDAExecutionProvider' not in get_available_providers():
            raise RuntimeError("CUDAExecutionProvider is not available for --use_gpu!")
        else:
            print("use CUDAExecutionProvider")
    ort_session = InferenceSession(model_path, sess_options, providers=execution_providers)
    return ort_session
def create_model_for_provider(self):
    assert self.provider in get_all_providers(), \
        f"provider {self.provider} not found, {get_all_providers()}"

    # Few properties that might have an impact on performance (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = 1
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    # Load the model as a graph and prepare the CPU backend
    session = InferenceSession(self.model_path, options, providers=[self.provider])
    session.disable_fallback()
    return session
def __init__(self, *args, **kwargs):
    "Overwrites the constructor."
    runtime_options = kwargs.pop('runtime_options', {})
    disable_optimisation = runtime_options.pop('disable_optimisation', False)
    if disable_optimisation:
        if 'sess_options' in kwargs:
            raise RuntimeError(
                "Incompatible options, 'disable_optimisation' and 'sess_options' "
                "cannot be specified at the same time.")
        kwargs['sess_options'] = SessionOptions()
        kwargs['sess_options'].graph_optimization_level = (
            GraphOptimizationLevel.ORT_DISABLE_ALL)
    self.sess, self.outi, self.erri = _capture_output(
        lambda: InferenceSession(*args, **kwargs), 'c')
def check_outputs(self, model, model_onnx, Xtest, predict_attributes,
                  decimal=5, skip_if_float32=False, disable_optimisation=True):
    if "TransposeScaleMatMul" in str(model_onnx):
        raise RuntimeError("This node must not be added.")
    if predict_attributes is None:
        predict_attributes = {}
    exp = model.predict(Xtest, **predict_attributes)
    if disable_optimisation and GraphOptimizationLevel is not None:
        opts = SessionOptions()
        opts.graph_optimization_level = (
            GraphOptimizationLevel.ORT_DISABLE_ALL)
        sess = InferenceSession(model_onnx.SerializeToString(),
                                sess_options=opts)
    else:
        sess = InferenceSession(model_onnx.SerializeToString())
    got = sess.run(None, {'X': Xtest})
    if isinstance(exp, tuple):
        if len(exp) != len(got):
            raise AssertionError("Mismatched number of outputs.")
        for i, (e, g) in enumerate(zip(exp, got)):
            if skip_if_float32 and g.dtype == np.float32:
                continue
            try:
                assert_almost_equal(self.remove_dim1(e),
                                    self.remove_dim1(g),
                                    decimal=decimal)
            except AssertionError as e:  # noqa
                raise AssertionError(
                    "Mismatch for output {} and attributes {}.".format(
                        i, predict_attributes)) from e
    else:
        if skip_if_float32 and Xtest.dtype == np.float32:
            return
        assert_almost_equal(np.squeeze(exp), np.squeeze(got),
                            decimal=decimal)
def create_model_for_provider(model_path: str, provider: str) -> InferenceSession:
    """
    A note on ExecutionProvider: ONNX Runtime uses providers to represent
    different execution devices, such as the CUDA provider. ONNX Runtime v1.0
    supports seven providers, including CPU, CUDA, TensorRT and MKL.

    :param model_path:
    :param provider:
    :return:
    """
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # Few properties that might have an impact on performance (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = 1
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    # Load the model as a graph and prepare the CPU backend
    session = InferenceSession(model_path, options, providers=[provider])
    session.disable_fallback()
    return session
def create_model_for_provider(model_path: str, provider: str,
                              optimization_level: str) -> InferenceSession:
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # Few properties that might have an impact on performance (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = 1
    if optimization_level in GRAPH_OPTIMIZATIONS:
        options.graph_optimization_level = GRAPH_OPTIMIZATIONS[optimization_level]
    else:
        raise KeyError(
            f"Unknown optimization level {optimization_level} "
            f"(available optimization levels are all/disable_all/basic/extended)")

    # Load the model as a graph and prepare the CPU backend
    session = InferenceSession(model_path, options, providers=[provider])
    session.disable_fallback()
    return session
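# GRAPH_OPTIMIZATIONS is assumed to be a module-level lookup table; a sketch
# consistent with the keys named in the error message above
# (all/disable_all/basic/extended):
from onnxruntime import GraphOptimizationLevel

GRAPH_OPTIMIZATIONS = {
    "all": GraphOptimizationLevel.ORT_ENABLE_ALL,
    "disable_all": GraphOptimizationLevel.ORT_DISABLE_ALL,
    "basic": GraphOptimizationLevel.ORT_ENABLE_BASIC,
    "extended": GraphOptimizationLevel.ORT_ENABLE_EXTENDED,
}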
def __init__(self, onnx_data, runtime, runtime_options=None):
    """
    @param onnx_data :epkg:`ONNX` model or data
    @param runtime runtime to be used, mostly :epkg:`onnxruntime`
    @param runtime_options runtime options
    """
    if runtime != 'onnxruntime1':
        raise NotImplementedError(  # pragma: no cover
            "runtime '{}' is not implemented.".format(runtime))
    if hasattr(onnx_data, 'SerializeToString'):
        onnx_data = onnx_data.SerializeToString()
    self.runtime = runtime
    sess_options = SessionOptions()
    self.run_options = RunOptions()
    try:
        sess_options.sessions_log_verbosity_level = 0
    except AttributeError:  # pragma: no cover
        # onnxruntime not recent enough.
        pass
    try:
        self.run_options.run_log_verbosity_level = 0
    except AttributeError:  # pragma: no cover
        # onnxruntime not recent enough.
        pass
    if (runtime_options is not None and
            runtime_options.get('disable_optimisation', False)):
        # disable_optimisation turns graph optimization off entirely.
        sess_options.graph_optimization_level = (
            GraphOptimizationLevel.ORT_DISABLE_ALL)
    try:
        self.sess = InferenceSession(onnx_data, sess_options=sess_options)
    except (OrtFail, OrtNotImplemented, OrtInvalidGraph,
            OrtInvalidArgument, OrtRuntimeException, RuntimeError) as e:
        raise RuntimeError(
            "Unable to create InferenceSession due to '{}'\n{}.".format(
                e, display_onnx(onnx.load(BytesIO(onnx_data))))) from e
def get_onnx_runtime_sessions(
    model_paths,
    default: bool = True,
    opt_level: int = 99,
    parallel_exe_mode: bool = True,
    n_threads: int = 4,
    provider=('CPUExecutionProvider',),
):
    '''
    Optimizes the model.

    Args:
        model_paths : paths to the input onnx encoder, decoder and
                      initial decoder models.
        default (bool) : when True, ort chooses the best settings for your
                      hardware (you can test out different settings for
                      better results).
        opt_level (int) : sess_options.GraphOptimizationLevel parameter;
                      1 uses 'ORT_ENABLE_BASIC', 2 'ORT_ENABLE_EXTENDED'
                      and 99 'ORT_ENABLE_ALL'. Default is 99.
        parallel_exe_mode (bool) : sets the execution mode. Default is
                      parallel.
        n_threads (int) : number of threads used to parallelize the
                      execution within nodes. Default is 4; use 0 to let
                      onnxruntime choose.
        provider : execution providers list.

    Returns:
        encoder_sess : encoder onnx InferenceSession
        decoder_sess : decoder onnx InferenceSession
        decoder_sess_init : initial decoder onnx InferenceSession
    '''
    path_to_encoder, path_to_decoder, path_to_initial_decoder = model_paths

    if default:
        encoder_sess = InferenceSession(str(path_to_encoder))
        decoder_sess = InferenceSession(str(path_to_decoder))
        decoder_sess_init = InferenceSession(str(path_to_initial_decoder))
    else:
        # Few properties that might have an impact on performance
        options = SessionOptions()
        if opt_level == 1:
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
        elif opt_level == 2:
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_EXTENDED
        else:
            assert opt_level == 99
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        # set parallel_exe_mode to True for better performance
        if parallel_exe_mode:
            options.execution_mode = ExecutionMode.ORT_PARALLEL
        else:
            options.execution_mode = ExecutionMode.ORT_SEQUENTIAL

        options.intra_op_num_threads = n_threads
        # options.inter_op_num_threads = 10
        # options.enable_profiling = True

        encoder_sess = InferenceSession(str(path_to_encoder), options,
                                        providers=list(provider))
        decoder_sess = InferenceSession(str(path_to_decoder), options,
                                        providers=list(provider))
        decoder_sess_init = InferenceSession(str(path_to_initial_decoder),
                                             options, providers=list(provider))

    return encoder_sess, decoder_sess, decoder_sess_init
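# Usage sketch for a T5-style encoder/decoder export (the three file paths
# are placeholders for actual exported models):
model_paths = ("t5-encoder.onnx", "t5-decoder.onnx", "t5-init-decoder.onnx")
encoder_sess, decoder_sess, decoder_sess_init = get_onnx_runtime_sessions(
    model_paths, default=False, opt_level=99, parallel_exe_mode=True, n_threads=4)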
def create_onnxruntime_session(
    onnx_model_path,
    use_gpu,
    provider=None,
    enable_all_optimization=True,
    num_threads=-1,
    enable_profiling=False,
    verbose=False,
    provider_options=None,  # map execution provider name to its option
):
    session = None
    try:
        from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions

        sess_options = SessionOptions()

        if enable_all_optimization:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        if enable_profiling:
            sess_options.enable_profiling = True

        if num_threads > 0:
            sess_options.intra_op_num_threads = num_threads
            logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}")

        if verbose:
            sess_options.log_severity_level = 0
        else:
            sess_options.log_severity_level = 4

        logger.debug(f"Create session for onnx model: {onnx_model_path}")
        if use_gpu:
            if provider == "dml":
                providers = ["DmlExecutionProvider", "CPUExecutionProvider"]
            elif provider == "rocm":
                providers = ["ROCMExecutionProvider", "CPUExecutionProvider"]
            elif provider == "migraphx":
                providers = ["MIGraphXExecutionProvider", "ROCMExecutionProvider",
                             "CPUExecutionProvider"]
            elif provider == "cuda":
                providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
            elif provider == "tensorrt":
                providers = ["TensorrtExecutionProvider", "CUDAExecutionProvider",
                             "CPUExecutionProvider"]
            else:
                providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        else:
            providers = ["CPUExecutionProvider"]

        if provider_options:
            providers = [(name, provider_options[name]) if name in provider_options else name
                         for name in providers]

        session = InferenceSession(onnx_model_path, sess_options, providers=providers)
    except Exception:
        logger.error("Exception", exc_info=True)

    return session
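# Usage sketch: provider_options maps a provider name to its option dict,
# which the function turns into ("CUDAExecutionProvider", {...}) tuples
# ("model.onnx" is a placeholder; device_id is a documented
# CUDAExecutionProvider option):
session = create_onnxruntime_session(
    "model.onnx", use_gpu=True, provider="cuda",
    provider_options={"CUDAExecutionProvider": {"device_id": 0}})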
def _init(self, variables=None):
    """
    Initializes the node.

    @param variables registered variables created by previous operators

    The current implementation for operator *Scan* only works for matrices.
    """
    try:
        self.alg_class = getattr(alg2, 'Onnx' + self.onnx_node.op_type)
    except AttributeError:
        self.alg_class = getattr(alg, 'Onnx' + self.onnx_node.op_type)
    inputs = list(self.onnx_node.input)
    self.mapping, self.inputs = self._name_mapping(inputs)
    self.outputs = list(self.onnx_node.output)

    options = self.options.copy()
    target_opset = options.pop('target_opset', None)
    domain = options.pop('domain', None)
    disable_optimisation = options.pop('disable_optimisation', False)
    ir_version = options.pop('ir_version', None)

    if domain == '' and target_opset < 9:
        # target_opset should be >= 9 for the main domain.
        # We assume it was the case when the graph was created.
        pass

    if self.onnx_node.op_type == 'ConstantOfShape':
        for k in options:
            v = options[k]
            if isinstance(v, numpy.ndarray):
                options[k] = make_tensor(
                    k, self._guess_proto_type(v.dtype),
                    v.shape, v.tolist())

        self.inst_ = self.alg_class(*self.inputs, output_names=self.outputs,
                                    op_version=target_opset, **options)
        inputs = get_defined_inputs(self.inputs, variables, dtype=self.dtype)
        try:
            self.onnx_ = self.inst_.to_onnx(inputs, target_opset=target_opset,
                                            domain=domain)
            if "dim_value: 0" in str(self.onnx_):
                raise RuntimeError(  # pragma: no cover
                    "Probable issue as one dimension is null.\n--\n{}".format(
                        self.onnx_))
        except AttributeError as e:  # pragma: no cover
            # older version of skl2onnx
            self.onnx_ = self.inst_.to_onnx(inputs)
            if "dim_value: 0" in str(self.onnx_):
                raise RuntimeError(
                    "Probable issue as one dimension is null.\n--\n{}".format(
                        self.onnx_)) from e
        forced = False
    elif self.onnx_node.op_type == 'Scan':
        self.inst_ = self.alg_class(*self.inputs, output_names=self.outputs,
                                    op_version=target_opset, **options)
        inputs = get_defined_inputs(self.inputs, variables, dtype=self.dtype)
        outputs = get_defined_outputs(self.outputs, self.onnx_node,
                                      inputs, variables, dtype=self.dtype)
        inputs = [(name, cl.__class__([None, None])) for (name, cl) in inputs]
        outputs = [(name, cl.__class__([None, None])) for (name, cl) in outputs]
        self.onnx_ = self.inst_.to_onnx(inputs, outputs=outputs,
                                        target_opset=target_opset, domain=domain)
        if "dim_value: 0" in str(self.onnx_):
            raise RuntimeError(  # pragma: no cover
                "Probable issue as one dimension is null.\n--\n{}".format(
                    self.onnx_))
        forced = True
    else:
        self.inst_ = self.alg_class(*self.inputs, output_names=self.outputs,
                                    op_version=target_opset, domain=domain,
                                    **options)
        inputs = get_defined_inputs(self.inputs, variables, dtype=self.dtype)
        try:
            self.onnx_ = self.inst_.to_onnx(inputs, target_opset=target_opset,
                                            domain=domain)
            if "dim_value: 0" in str(self.onnx_):
                raise RuntimeError(  # pragma: no cover
                    "Probable issue as one dimension is null.\n--\n{}\n---\n{}"
                    .format(self.onnx_, inputs))
            forced = False
        except (RuntimeError, ValueError) as e:
            # Let's try again by forcing output types.
            forced = True
            outputs = get_defined_outputs(self.outputs, self.onnx_node,
                                          inputs, variables, dtype=self.dtype)
            self.onnx_ = self.inst_.to_onnx(inputs, outputs=outputs,
                                            target_opset=target_opset,
                                            domain=domain)
            if "dim_value: 0" in str(self.onnx_):
                raise RuntimeError(  # pragma: no cover
                    "Probable issue as one dimension is null.\n--\n{}".format(
                        self.onnx_)) from e

    if len(self.onnx_.graph.output) != len(self.outputs):
        # Something is wrong, falls back to the default plan.
        forced = True
        outputs = get_defined_outputs(self.outputs, self.onnx_node,
                                      inputs, variables, dtype=self.dtype)
        self.onnx_ = self.inst_.to_onnx(inputs, outputs=outputs,
                                        target_opset=target_opset,
                                        domain=domain)
        if "dim_value: 0" in str(self.onnx_):
            raise RuntimeError(
                "Probable issue as one dimension is null.\n--\n{}".format(
                    self.onnx_))
    else:
        lo = list(self.onnx_.graph.output)
        outputs = proto2vars(lo)

    sess_options = SessionOptions()
    self.run_options = RunOptions()

    try:
        sess_options.session_log_severity_level = 3
        # sess_options.sessions_log_verbosity_level = 0
    except AttributeError:
        # onnxruntime not recent enough.
        pass
    try:
        self.run_options.run_log_severity_level = 3
        # self.run_options.run_log_verbosity_level = 0
    except AttributeError:
        # onnxruntime not recent enough.
        pass
    if ir_version is not None:
        self.onnx_.ir_version = ir_version
    if disable_optimisation:
        sess_options.graph_optimization_level = (
            GraphOptimizationLevel.ORT_DISABLE_ALL)
    try:
        self.sess_ = InferenceSession(self.onnx_.SerializeToString(),
                                      sess_options=sess_options)
    except (RuntimeError, OrtNotImplemented, OrtInvalidGraph, OrtFail) as e:
        raise RuntimeError(
            "Unable to load node '{}' (output type was {})\n{}".format(
                self.onnx_node.op_type,
                "guessed" if forced else "inferred",
                self.onnx_)) from e
    self.typed_outputs_ = outputs
###############################
# Creation of the session.

data = []
files = []
legend = []

for graph_opt, name_opt in tqdm([
        (GraphOptimizationLevel.ORT_DISABLE_ALL, "ORT_DISABLE_ALL"),
        (GraphOptimizationLevel.ORT_ENABLE_BASIC, "ORT_ENABLE_BASIC"),
        (GraphOptimizationLevel.ORT_ENABLE_EXTENDED, "ORT_ENABLE_EXTENDED"),
        (GraphOptimizationLevel.ORT_ENABLE_ALL, "ORT_ENABLE_ALL")]):

    so = SessionOptions()
    so.graph_optimization_level = graph_opt
    so.optimized_model_filepath = (
        os.path.split(filename)[-1] + f".optimized.{name_opt}.onnx")
    files.append(so.optimized_model_filepath)
    legend.append(name_opt)
    sess = InferenceSession(onx.SerializeToString(), so,
                            providers=[provider])
    bind = SessionIOBinding(sess._sess)

    #####################################
    # Creates random data
    feed = random_feed(sess, batch)

    #####################################
    # moving the data on CPU or GPU
    feed_ort_value = OrderedDict(
        (name, (C_OrtValue.ortvalue_from_numpy(v, ort_device), v.dtype))
        for name, v in feed.items())
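# random_feed is assumed to generate random inputs matching the session
# signature; a minimal sketch under that assumption (float32 inputs only,
# symbolic dimensions replaced by the batch size):
import numpy as np

def random_feed(sess, batch=10):
    feed = {}
    for inp in sess.get_inputs():
        # replace symbolic/unknown dimensions with the batch size
        shape = tuple(d if isinstance(d, int) else batch for d in inp.shape)
        feed[inp.name] = np.random.rand(*shape).astype(np.float32)
    return feed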