def benchmark_op(repeat=10, number=10, name="Slice", shape_slice_fct=None,
                 save=None, opset=14, repeat_profile=1500, verbose=1):
    if verbose:
        print("[benchmark_op] start repeat=%d number=%d repeat_profile=%d"
              " opset=%d." % (repeat, number, repeat_profile, opset))
    res = []
    for dim in tqdm([8, 16, 32, 64, 100, 128, 200, 256, 400, 512,
                     600, 784, 800, 1000, 1024, 1200]):
        shape, slices = shape_slice_fct(dim)
        onx, ort_fct, npy_fct, ort_fct_gpu = build_ort_op(
            save=save, op_version=opset, slices=slices)

        n_arrays = 20
        if dim >= 512:
            n_arrays = 10
        xs = [numpy.random.rand(*shape).astype(numpy.float32)
              for _ in range(n_arrays)]
        info = dict(shape=shape)

        ctx = dict(xs=xs, loop_fct=loop_fct)

        # numpy
        ctx['fct'] = npy_fct
        obs = measure_time(lambda: loop_fct(npy_fct, xs),
                           div_by_number=True, context=ctx,
                           repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'numpy'
        obs['shape'] = ",".join(map(str, shape))
        obs['slices'] = str(slices)
        obs.update(info)
        res.append(obs)

        # onnxruntime
        ctx['fct'] = ort_fct
        obs = measure_time(lambda: loop_fct(ort_fct, xs),
                           div_by_number=True, context=ctx,
                           repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'ort'
        obs['shape'] = ",".join(map(str, shape))
        obs['slices'] = str(slices)
        obs.update(info)
        res.append(obs)

        if ort_fct_gpu is not None:
            # onnxruntime on GPU
            dev = get_ort_device('cuda:0')
            ctx['xs'] = [C_OrtValue.ortvalue_from_numpy(x, dev) for x in xs]
            ctx['fct'] = ort_fct_gpu
            obs = measure_time(lambda: loop_fct(ort_fct_gpu, ctx['xs']),
                               div_by_number=True, context=ctx,
                               repeat=repeat, number=number)
            obs['dim'] = dim
            obs['fct'] = 'ort_gpu'
            obs['shape'] = ",".join(map(str, shape))
            obs['slices'] = str(slices)
            obs.update(info)
            res.append(obs)

    # profiling CPU
    if verbose:
        print("[benchmark_op] done.")
        print("[benchmark_op] profile CPU.")
    so = SessionOptions()
    so.enable_profiling = True
    sess = InferenceSession(onx.SerializeToString(), so,
                            providers=["CPUExecutionProvider"])
    for i in range(repeat_profile):
        sess.run(None, {'X': xs[-1]})
    prof = sess.end_profiling()
    with open(prof, "r") as f:
        js = json.load(f)
    dfprof = DataFrame(OnnxWholeSession.process_profiling(js))
    dfprof['shape'] = ",".join(map(str, shape))
    dfprof['slices'] = str(slices)
    if verbose:
        print("[benchmark_op] done.")

    # profiling GPU
    if ort_fct_gpu is not None:
        if verbose:
            print("[benchmark_op] profile GPU.")
        so = SessionOptions()
        so.enable_profiling = True
        sess = InferenceSession(onx.SerializeToString(), so,
                                providers=["CUDAExecutionProvider"])
        io_binding = sess.io_binding()._iobinding
        device = get_ort_device('cpu')

        for i in range(repeat_profile):
            x = ctx['xs'][-1]
            io_binding.bind_input(
                'X', device, numpy.float32, x.shape(), x.data_ptr())
            io_binding.bind_output('Y', device)
            sess._sess.run_with_iobinding(io_binding, None)

        prof = sess.end_profiling()
        with open(prof, "r") as f:
            js = json.load(f)
        dfprofgpu = DataFrame(OnnxWholeSession.process_profiling(js))
        dfprofgpu['shape'] = ",".join(map(str, shape))
        dfprofgpu['slices'] = str(slices)
        if verbose:
            print("[benchmark_op] profile done.")
    else:
        dfprofgpu = None

    # Dataframes
    shape_name = str(shape).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    piv = df.pivot(index='shape', columns='fct', values='average')

    rs = piv.copy()
    for c in ['numpy', 'ort', 'ort_gpu']:
        if c in rs.columns:
            rs[f"numpy/{c}"] = rs['numpy'] / rs[c]
    rs = rs[[c for c in rs.columns if "/numpy" not in c]].copy()

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title=f"{name} benchmark\n{shape_name!r} lower better")
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title=f"{name} Speedup, baseline=numpy\n"
                  f"{shape_name!r} higher better")
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return dfprof, dfprofgpu, df, rs, ax
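
#######################################
# A minimal usage sketch (added for illustration, not part of the original
# example): ``shape_slice`` below is a hypothetical callable mapping a
# dimension to the input shape and the slices handed to ``build_ort_op``;
# the slice layout must match whatever ``build_ort_op`` expects earlier
# in this example.


def shape_slice(dim):
    # Square input, keep the first half along both axes (hypothetical).
    shape = (dim, dim)
    slices = [slice(0, dim // 2), slice(0, dim // 2)]
    return shape, slices


dfprof, dfprofgpu, df, rs, ax = benchmark_op(
    shape_slice_fct=shape_slice, name="Slice", repeat_profile=100)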
class InferenceSession:  # pylint: disable=E0102
    """
    Wrapper around InferenceSession from :epkg:`onnxruntime`.

    :param onnx_bytes: onnx bytes
    :param sess_options: session options
    :param log_severity_level: change the logging level
    :param device: device, a string `cpu`, `cuda`, `cuda:0`...
    """

    def __init__(self, onnx_bytes, sess_options=None, log_severity_level=4,
                 device=None):
        if OrtInferenceSession is None:
            raise ImportError(  # pragma: no cover
                "onnxruntime is not available.")
        self.log_severity_level = log_severity_level
        if device is None:
            self.device = get_ort_device('cpu')
        else:
            self.device = get_ort_device(device)
        self.providers = device_to_providers(self.device)
        set_default_logger_severity(3)
        if sess_options is None:
            self.so = SessionOptions()
            self.so.log_severity_level = log_severity_level
            self.sess = OrtInferenceSession(onnx_bytes, sess_options=self.so,
                                            providers=self.providers)
        else:
            self.so = sess_options
            self.sess = OrtInferenceSession(onnx_bytes,
                                            sess_options=sess_options,
                                            providers=self.providers)
        self.ro = RunOptions()
        self.ro.log_severity_level = log_severity_level
        self.ro.log_verbosity_level = log_severity_level
        self.output_names = [o.name for o in self.get_outputs()]

    def run(self, output_names, input_feed, run_options=None):
        """
        Executes the ONNX graph.

        :param output_names: None for all, a name for a specific output
        :param input_feed: dictionary of inputs
        :param run_options: None or RunOptions
        :return: array
        """
        if any(map(lambda v: isinstance(v, C_OrtValue),
                   input_feed.values())):
            return self.sess._sess.run_with_ort_values(
                input_feed, self.output_names, run_options or self.ro)
        return self.sess.run(output_names, input_feed,
                             run_options or self.ro)

    def get_inputs(self):
        "Returns input types."
        return self.sess.get_inputs()

    def get_outputs(self):
        "Returns output types."
        return self.sess.get_outputs()

    def end_profiling(self):
        "Ends profiling."
        return self.sess.end_profiling()
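
# A short usage sketch (illustration only): ``onx`` is assumed to be an
# ONNX model built earlier and ``x`` a numpy array matching its input
# 'X'; both names are hypothetical.
sess = InferenceSession(onx.SerializeToString(), device='cpu')
print([i.name for i in sess.get_inputs()])
y = sess.run(None, {'X': x})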
def run_with_iobinding(sess, bind, ort_device, feed_ort_value, outputs):
    # Bind every input to its OrtValue, bind the outputs, run, then
    # fetch the results.
    for name, (value, dtype) in feed_ort_value.items():
        bind.bind_input(name, ort_device, dtype,
                        value.shape(), value.data_ptr())
    for out in outputs:
        bind.bind_output(out, ort_device)
    sess._sess.run_with_iobinding(bind, None)
    ortvalues = bind.get_outputs()
    return [o.numpy() for o in ortvalues]


#######################################
# The profiling.

for i in tqdm(range(0, 10)):
    run_with_iobinding(sess, bind, ort_device, feed_ort_value, outputs)

prof = sess.end_profiling()
with open(prof, "r") as f:
    js = json.load(f)
df = pandas.DataFrame(OnnxWholeSession.process_profiling(js))
df

###################################
# First graph is by operator type.

gr_dur = df[['dur', "args_op_name"]].groupby(
    "args_op_name").sum().sort_values('dur')
total = gr_dur['dur'].sum()
gr_dur /= total
gr_n = df[['dur', "args_op_name"]].groupby(
    "args_op_name").count().sort_values('dur')
gr_n = gr_n.loc[gr_dur.index, :]
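
###################################
# A plotting sketch (added for illustration, assuming matplotlib's
# ``plt`` is imported as elsewhere in this example): draw the duration
# ratio and the call count per operator type side by side.

fig, ax = plt.subplots(1, 2, figsize=(10, 4))
gr_dur.plot.barh(ax=ax[0], title="duration per operator type (ratio)")
gr_n.plot.barh(ax=ax[1], title="number of calls per operator type")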
def latency(model, law='normal', size=1, number=10, repeat=10, max_time=0,
            runtime="onnxruntime", device='cpu', profiling=None):
    """
    Measures the latency of a model (python API).

    :param model: ONNX graph
    :param law: random law used to generate fake inputs
    :param size: batch size, it replaces the first dimension
        of every input if it is left unknown
    :param number: number of calls to measure
    :param repeat: number of times to repeat the experiment
    :param max_time: if it is > 0, it keeps running iterations
        until that much time has elapsed
    :param runtime: available runtime
    :param device: device, `cpu`, `cuda:0`
    :param profiling: if True, profile the execution of every node;
        the result can be sorted by name or type, so the value of this
        parameter should be in `(None, 'name', 'type')`
    :return: dictionary, or a tuple (dictionary, dataframe)
        if profiling is enabled

    .. cmdref::
        :title: Measures model latency
        :cmd: -m mlprodict latency --help
        :lid: l-cmd-latency

        The command generates random inputs and calls the model many
        times on these inputs. It returns the processing time for one
        iteration.

        Example::

            python -m mlprodict latency --model "model.onnx"
    """
    if isinstance(model, str) and not os.path.exists(model):
        raise FileNotFoundError(  # pragma: no cover
            "Unable to find model %r." % model)
    if profiling not in (None, '', 'name', 'type'):
        raise ValueError("Unexpected value for profiling: %r." % profiling)
    size = int(size)
    number = int(number)
    repeat = int(repeat)
    if max_time in (None, 0, ""):
        max_time = None
    else:
        max_time = float(max_time)
        if max_time <= 0:
            max_time = None

    if law != "normal":
        raise ValueError("Only law='normal' is supported, not %r." % law)

    if device in ('cpu', 'CPUExecutionProvider'):
        providers = ['CPUExecutionProvider']
    elif device in ('cuda:0', 'CUDAExecutionProvider'):
        if runtime != 'onnxruntime':
            raise NotImplementedError(  # pragma: no cover
                "Only runtime 'onnxruntime' supports this device or "
                "provider %r." % device)
        providers = ['CUDAExecutionProvider']
    elif ',' in device:
        if runtime != 'onnxruntime':
            raise NotImplementedError(  # pragma: no cover
                "Only runtime 'onnxruntime' supports this device or "
                "provider %r." % device)
        providers = device.split(',')
        allp = set(get_all_providers())
        for p in providers:
            if p not in allp:
                raise ValueError(
                    "One device or provider %r is not supported among %r."
                    "" % (p, allp))
    else:
        raise ValueError(  # pragma: no cover
            "Device %r not supported." % device)

    if runtime == "onnxruntime":
        if profiling in ('name', 'type'):
            so = SessionOptions()
            so.enable_profiling = True
            sess = InferenceSession(model, sess_options=so)
        else:
            sess = InferenceSession(model)
        fct = lambda feeds: sess.run(None, feeds)
        inputs = sess.get_inputs()
    else:
        if profiling in ('name', 'type'):
            runtime_options = {"enable_profiling": True}
            if runtime != 'onnxruntime1':
                raise NotImplementedError(  # pragma: no cover
                    "Profiling is not implemented for runtime=%r."
                    % runtime)
        else:
            runtime_options = None
        oinf = OnnxInference(model, runtime=runtime,
                             runtime_options=runtime_options)
        fct = lambda feeds: oinf.run(feeds)
        inputs = oinf.obj.graph.input

    feeds = random_feed(inputs, size)
    res = measure_time(lambda: fct(feeds), number=number, repeat=repeat,
                       context={}, max_time=max_time, div_by_number=True)
    for k, v in feeds.items():
        res["shape(%s)" % k] = "x".join(map(str, v.shape))
    if profiling in ('name', 'type'):
        if runtime == 'onnxruntime':
            profile_name = sess.end_profiling()
            with open(profile_name, 'r', encoding='utf-8') as f:
                js = json.load(f)
            js = OnnxWholeSession.process_profiling(js)
            df = DataFrame(js)
        else:
            df = oinf.get_profiling(as_df=True)
        if profiling == 'name':
            gr = df[['dur', "args_op_name", "name"]].groupby(
                ["args_op_name", "name"]).sum().sort_values('dur')
        else:
            gr = df[['dur', "args_op_name"]].groupby(
                "args_op_name").sum().sort_values('dur')
        return res, gr
    return res
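
# A short usage sketch (illustration only; "model.onnx" is a placeholder
# path): with profiling='type', the function returns both the latency
# metrics and the cumulated duration per operator type.
res, gr = latency("model.onnx", device='cpu', profiling='type')
print(res)
print(gr)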