def test1(model, X_test, number, repeat, name, **data):
    res = measure_time('model.predict(X_test)', div_by_number=True,
                       number=number, repeat=repeat,
                       context={'X_test': X_test, 'model': model})
    res['name'] = name
    res['runtime'] = 'skl'
    res['version'] = skl_version
    res['batch'] = "y"
    res.update(data)
    return res
def test5(oinf, X_test, number, repeat, name, **data):
    res = measure_time('loop_model2(oinf, X_test)', div_by_number=True,
                       number=number, repeat=repeat,
                       context={'oinf': oinf, 'X_test': X_test,
                                'loop_model2': loop_model2})
    res['name'] = name
    res['runtime'] = 'mlprodict'
    res['version'] = pyrt_version
    res['batch'] = "n"
    res.update(data)
    return res
def test4(oinf, X_test, number, repeat, name, **data):
    res = measure_time('oinf.run({"X": X_test})', div_by_number=True,
                       number=number, repeat=repeat,
                       context={'oinf': oinf, 'X_test': X_test})
    res['name'] = name
    res['runtime'] = 'mlprodict'
    res['version'] = pyrt_version
    res['batch'] = "y"
    res.update(data)
    return res
def test3(sess, X_test, number, repeat, name, **data):
    res = measure_time('loop_model(sess, X_test)', div_by_number=True,
                       number=number, repeat=repeat,
                       context={'sess': sess, 'X_test': X_test,
                                'loop_model': loop_model})
    res['name'] = name
    res['runtime'] = 'onnxruntime'
    res['version'] = ort_version
    res['batch'] = "n"
    res.update(data)
    return res
def test2(sess, X_test, number, repeat, name, **data):
    res = measure_time('sess.run(None, {"X": X_test})', div_by_number=True,
                       number=number, repeat=repeat,
                       context={'sess': sess, 'X_test': X_test})
    res['name'] = name
    res['runtime'] = 'onnxruntime'
    res['version'] = ort_version
    res['batch'] = "y"
    res.update(data)
    return res
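# A hypothetical driver for the helpers above (not part of the original
# benchmark): 'model', 'sess', 'oinf' and 'X_test' are assumed to exist
# (a scikit-learn model, an onnxruntime InferenceSession, a mlprodict
# OnnxInference and a test matrix). Each helper returns one dictionary,
# which stacks naturally into a DataFrame.
rows = []
for N in (1, 10, 100):
    x = X_test[:N]
    rows.append(test1(model, x, number=10, repeat=5, name="bench", N=N))
    rows.append(test2(sess, x, number=10, repeat=5, name="bench", N=N))
    rows.append(test4(oinf, x, number=10, repeat=5, name="bench", N=N))
df = pandas.DataFrame(rows)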
def _measure_time(stmt, *x, repeat=5, number=5, div_by_number=True,
                  first_run=True, max_time=None):
    """
    Measures a statement and returns the results as a dictionary.

    :param stmt: string
    :param *x: inputs
    :param repeat: average over *repeat* experiments
    :param number: number of executions in one row
    :param div_by_number: divide by the number of executions
    :param first_run: if True, runs the function once before measuring
    :param max_time: execute the statement until the total goes beyond
        this time (approximately), *repeat* is ignored,
        *div_by_number* must be set to True
    :return: dictionary

    See `Timer.repeat
    <https://docs.python.org/3/library/timeit.html?timeit.Timer.repeat>`_
    for a better understanding of parameters *repeat* and *number*.
    The function returns a duration corresponding to *number* times
    the execution of the main statement.
    """
    if first_run:
        try:
            stmt(*x)
        except RuntimeError as e:  # pragma: no cover
            # x is a tuple of inputs, report the dtype of the first one
            raise RuntimeError("{}-{}".format(
                type(x[0]), getattr(x[0], 'dtype', '?'))) from e

    def fct():
        stmt(*x)

    if first_run:
        fct()
    return measure_time(fct, context={}, repeat=repeat, number=number,
                        div_by_number=div_by_number, max_time=max_time)
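# A minimal sketch of how _measure_time can be called on a plain numpy
# statement; the array size is illustrative only.
x = numpy.random.rand(1000).astype(numpy.float32)
res = _measure_time(numpy.sum, x, repeat=3, number=10)
print(res['average'])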
def benchmark_op(axes, repeat=5, number=5, name="ReduceSum",
                 shape_fct=None, custom_impl=False):
    if shape_fct is None:
        def shape_fct(dim):
            return (3, dim, 1, 128, 64)
    ort_fct = build_ort_reducesum(axes)
    res = []
    for dim in tqdm([8, 16, 32, 64, 100, 128, 200, 256, 400, 512, 1024]):
        shape = shape_fct(dim)
        n_arrays = 10 if dim < 512 else 4
        xs = [numpy.random.rand(*shape).astype(numpy.float32)
              for _ in range(n_arrays)]
        ys = [numpy.array(axes, dtype=numpy.int64)
              for _ in range(n_arrays)]
        info = dict(axes=axes, shape=shape)

        # numpy
        ctx = dict(
            xs=xs, ys=ys,
            # axis must be passed as a tuple, otherwise extra elements of
            # y would be interpreted as dtype/out when len(axes) > 1
            fct=lambda x, y: numpy.sum(x, axis=tuple(y)),
            loop_fct=loop_fct)
        obs = measure_time(
            "loop_fct(fct, xs, ys)",
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'numpy'
        obs.update(info)
        res.append(obs)

        # onnxruntime
        ctx['fct'] = ort_fct
        obs = measure_time(
            "loop_fct(fct, xs, ys)",
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'ort'
        obs.update(info)
        res.append(obs)

        if custom_impl:
            if axes != (0, ):
                raise RuntimeError("Unexpected axes=%r." % axes)
            ctx['fct'] = lambda x, y: custom_reducesum_rk_float(x)
            ctx['xs'] = [x.reshape((x.shape[0], -1)).copy() for x in xs]
            obs = measure_time(
                "loop_fct(fct, xs, ys)",
                div_by_number=True, context=ctx, repeat=repeat,
                number=number)
            obs['dim'] = dim
            obs['fct'] = 'custom'
            obs.update(info)
            res.append(obs)

        if tf_reduce_sum is not None:
            # tensorflow
            ctx['fct'] = tf_reduce_sum
            ctx['xs'] = [convert_to_tensor(x) for x in xs]
            ctx['ys'] = ys
            obs = measure_time(
                "loop_fct(fct, xs, ys)",
                div_by_number=True, context=ctx, repeat=repeat,
                number=number)
            obs['dim'] = dim
            obs['fct'] = 'tf'
            obs.update(info)
            res.append(obs)

        if torch_sum is not None:
            def torch_sum1(x, y):
                return torch_sum(x, y[0])

            def torch_sum2(x, y):
                return torch_sum(torch_sum(x, y[1]), y[0])

            # torch
            ctx['fct'] = torch_sum1 if len(axes) == 1 else torch_sum2
            ctx['xs'] = [from_numpy(x) for x in xs]
            ctx['ys'] = ys  # [from_numpy(y) for y in ys]
            obs = measure_time(
                "loop_fct(fct, xs, ys)",
                div_by_number=True, context=ctx, repeat=repeat,
                number=number)
            obs['dim'] = dim
            obs['fct'] = 'torch'
            obs.update(info)
            res.append(obs)

    # Dataframes
    shape_name = str(shape).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    df.columns = [_.replace('dim', 'N') for _ in df.columns]
    piv = df.pivot(index='N', columns='fct', values='average')

    rs = piv.copy()
    for c in ['ort', 'torch', 'tf', 'tf_copy']:
        if c in rs.columns:
            rs[c] = rs['numpy'] / rs[c]
    rs['numpy'] = 1.

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title="%s benchmark\n%r - %r lower better" % (
                 name, shape_name, axes))
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title="%s Speedup, baseline=numpy\n%r - %r higher better" % (
                name, shape_name, axes))
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return df, rs, ax
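# Example call, assuming build_ort_reducesum and loop_fct are defined
# earlier in this example: reduce over the second axis of the
# (3, N, 1, 128, 64) tensors generated inside the function.
df, rs, ax = benchmark_op(axes=(1, ))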
def benchmark_op(axes, repeat=2, number=5, name="ReduceMean",
                 shape_fct=None, max_dim=None):
    if shape_fct is None:
        def shape_fct(dim):
            return (3, dim, 1, 128, 64)
    ort_fct = build_ort_reducemean(axes)
    res = []
    for dim in tqdm([4, 8, 16, 32, 64, 100, 128, 200,
                     256, 400, 512, 1024]):
        if max_dim is not None and dim > max_dim:
            continue
        shape = shape_fct(dim)
        n_arrays = 10 if dim < 512 else 4
        xs = [numpy.random.rand(*shape).astype(numpy.float32)
              for _ in range(n_arrays)]
        ys = [numpy.array(axes, dtype=numpy.int64)
              for _ in range(n_arrays)]
        info = dict(axes=axes, shape=shape)

        # numpy
        fct = lambda x, y: numpy.mean(x, axis=tuple(y))
        ctx = dict(xs=xs, ys=ys, loop_fct=loop_fct)
        obs = measure_time(lambda: loop_fct(fct, xs, ys),
                           div_by_number=True, context=ctx,
                           repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'numpy'
        obs.update(info)
        res.append(obs)

        # onnxruntime
        fct = ort_fct
        obs = measure_time(lambda: loop_fct(fct, xs, ys),
                           div_by_number=True, context=ctx,
                           repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'ort'
        obs.update(info)
        res.append(obs)

        if tf_reduce_mean is not None:
            # tensorflow
            fct = tf_reduce_mean
            ctx['xs'] = [convert_to_tensor(x) for x in xs]
            ctx['ys'] = ys
            obs = measure_time(lambda: loop_fct(fct, ctx['xs'], ctx['ys']),
                               div_by_number=True, context=ctx,
                               repeat=repeat, number=number)
            obs['dim'] = dim
            obs['fct'] = 'tf'
            obs.update(info)
            res.append(obs)

        if torch_mean is not None:
            def torch_mean1(x, y):
                return torch_mean(x, y[0])

            def torch_mean2(x, y):
                return torch_mean(torch_mean(x, y[1]), y[0])

            # torch
            fct = torch_mean1 if len(axes) == 1 else torch_mean2
            ctx['xs'] = [from_numpy(x) for x in xs]
            ctx['ys'] = ys  # [from_numpy(y) for y in ys]
            obs = measure_time(lambda: loop_fct(fct, ctx['xs'], ctx['ys']),
                               div_by_number=True, context=ctx,
                               repeat=repeat, number=number)
            obs['dim'] = dim
            obs['fct'] = 'torch'
            obs.update(info)
            res.append(obs)

    # Dataframes
    shape_name = str(shape).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    df.columns = [_.replace('dim', 'N') for _ in df.columns]
    piv = df.pivot(index='N', columns='fct', values='average')

    rs = piv.copy()
    for c in ['ort', 'torch', 'tf', 'tf_copy']:
        if c in rs.columns:
            rs[c] = rs['numpy'] / rs[c]
    rs['numpy'] = 1.

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title="%s benchmark\n%r - %r lower better" % (
                 name, shape_name, axes))
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title="%s Speedup, baseline=numpy\n%r - %r higher better" % (
                name, shape_name, axes))
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return df, rs, ax
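# Example call; max_dim keeps the benchmark small, the other parameters
# keep their defaults.
df, rs, ax = benchmark_op(axes=(1, ), max_dim=256)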
def benchmark_op(repeat=10, number=10, name="Slice", shape_slice_fct=None,
                 save=None, opset=14, repeat_profile=1500, verbose=1):
    if verbose:
        print("[benchmark_op] start repeat=%d number=%d repeat_profile=%d"
              " opset=%d." % (repeat, number, repeat_profile, opset))
    res = []
    for dim in tqdm([8, 16, 32, 64, 100, 128, 200, 256, 400, 512,
                     600, 784, 800, 1000, 1024, 1200]):
        shape, slices = shape_slice_fct(dim)
        onx, ort_fct, npy_fct, ort_fct_gpu = build_ort_op(
            save=save, op_version=opset, slices=slices)

        n_arrays = 20
        if dim >= 512:
            n_arrays = 10
        xs = [numpy.random.rand(*shape).astype(numpy.float32)
              for _ in range(n_arrays)]
        info = dict(shape=shape)

        ctx = dict(xs=xs, loop_fct=loop_fct)

        # numpy
        ctx['fct'] = npy_fct
        obs = measure_time(lambda: loop_fct(npy_fct, xs),
                           div_by_number=True, context=ctx,
                           repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'numpy'
        obs['shape'] = ",".join(map(str, shape))
        obs['slices'] = str(slices)
        obs.update(info)
        res.append(obs)

        # onnxruntime
        ctx['fct'] = ort_fct
        obs = measure_time(lambda: loop_fct(ort_fct, xs),
                           div_by_number=True, context=ctx,
                           repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'ort'
        obs['shape'] = ",".join(map(str, shape))
        obs['slices'] = str(slices)
        obs.update(info)
        res.append(obs)

        if ort_fct_gpu is not None:
            # onnxruntime on GPU
            dev = get_ort_device('cuda:0')
            ctx['xs'] = [C_OrtValue.ortvalue_from_numpy(x, dev) for x in xs]
            ctx['fct'] = ort_fct_gpu
            obs = measure_time(lambda: loop_fct(ort_fct_gpu, ctx['xs']),
                               div_by_number=True, context=ctx,
                               repeat=repeat, number=number)
            obs['dim'] = dim
            obs['fct'] = 'ort_gpu'
            obs['shape'] = ",".join(map(str, shape))
            obs['slices'] = str(slices)
            obs.update(info)
            res.append(obs)

    # profiling CPU
    if verbose:
        print("[benchmark_op] done.")
        print("[benchmark_op] profile CPU.")
    so = SessionOptions()
    so.enable_profiling = True
    sess = InferenceSession(onx.SerializeToString(), so,
                            providers=["CPUExecutionProvider"])
    for _ in range(repeat_profile):
        sess.run(None, {'X': xs[-1]})
    prof = sess.end_profiling()
    with open(prof, "r") as f:
        js = json.load(f)
    dfprof = DataFrame(OnnxWholeSession.process_profiling(js))
    dfprof['shape'] = ",".join(map(str, shape))
    dfprof['slices'] = str(slices)
    if verbose:
        print("[benchmark_op] done.")

    # profiling GPU
    if ort_fct_gpu is not None:
        if verbose:
            print("[benchmark_op] profile GPU.")
        so = SessionOptions()
        so.enable_profiling = True
        sess = InferenceSession(onx.SerializeToString(), so,
                                providers=["CUDAExecutionProvider"])
        io_binding = sess.io_binding()._iobinding
        device = get_ort_device('cpu')

        for _ in range(repeat_profile):
            x = ctx['xs'][-1]
            io_binding.bind_input(
                'X', device, numpy.float32, x.shape(), x.data_ptr())
            io_binding.bind_output('Y', device)
            sess._sess.run_with_iobinding(io_binding, None)

        prof = sess.end_profiling()
        with open(prof, "r") as f:
            js = json.load(f)
        dfprofgpu = DataFrame(OnnxWholeSession.process_profiling(js))
        dfprofgpu['shape'] = ",".join(map(str, shape))
        dfprofgpu['slices'] = str(slices)
        if verbose:
            print("[benchmark_op] profile done.")
    else:
        dfprofgpu = None

    # Dataframes
    shape_name = str(shape).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    piv = df.pivot(index='shape', columns='fct', values='average')

    rs = piv.copy()
    for c in ['numpy', 'ort', 'ort_gpu']:
        if c in rs.columns:
            rs[f"numpy/{c}"] = rs['numpy'] / rs[c]
    rs = rs[[c for c in rs.columns if "/numpy" not in c]].copy()

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title=f"{name} benchmark\n{shape_name!r} lower better")
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title=f"{name} Speedup, baseline=numpy\n{shape_name!r} "
                  f"higher better")
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return dfprof, dfprofgpu, df, rs, ax
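# A hypothetical driver. shape_slice_fct must return the input shape and
# the slice specification consumed by build_ort_op; the exact format of
# 'slices' is whatever build_ort_op expects, the tuple of slice objects
# below is only a guess.
def shape_slice(dim):
    return (dim, dim), (slice(0, dim // 2), slice(0, dim // 2))

dfprof, dfprofgpu, df, rs, ax = benchmark_op(
    shape_slice_fct=shape_slice, repeat_profile=100, verbose=0)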
def benchmark_equation(equation):
    # equations
    ort_einsum = build_ort_einsum(equation)
    ort_einsum_decomposed = build_ort_decomposed(equation)
    res = []
    for dim in tqdm([8, 16, 32, 64, 100, 128, 200, 256, 500, 512]):
        xs = [numpy.random.rand(2, dim, 12, 64).astype(numpy.float32)
              for _ in range(5)]
        ys = [numpy.random.rand(2, dim, 12, 64).astype(numpy.float32)
              for _ in range(5)]

        # numpy
        ctx = dict(equation=equation, xs=xs, ys=ys,
                   einsum=numpy.einsum,
                   loop_einsum=loop_einsum,
                   loop_einsum_eq=loop_einsum_eq,
                   loop_einsum_eq_th=loop_einsum_eq_th)
        obs = measure_time(
            "loop_einsum_eq(einsum, equation, xs, ys)",
            div_by_number=True, context=ctx, repeat=5, number=1)
        obs['dim'] = dim
        obs['fct'] = 'numpy.einsum'
        res.append(obs)

        # opt-einsum
        ctx['einsum'] = contract
        obs = measure_time(
            "loop_einsum_eq(einsum, equation, xs, ys)",
            div_by_number=True, context=ctx, repeat=5, number=1)
        obs['dim'] = dim
        obs['fct'] = 'opt-einsum'
        res.append(obs)

        # onnxruntime
        ctx['einsum'] = ort_einsum
        obs = measure_time(
            "loop_einsum(einsum, xs, ys)",
            div_by_number=True, context=ctx, repeat=5, number=1)
        obs['dim'] = dim
        obs['fct'] = 'ort_einsum'
        res.append(obs)

        # onnxruntime decomposed
        ctx['einsum'] = ort_einsum_decomposed
        obs = measure_time(
            "loop_einsum(einsum, xs, ys)",
            div_by_number=True, context=ctx, repeat=5, number=1)
        obs['dim'] = dim
        obs['fct'] = 'ort_dec'
        res.append(obs)

        # custom implementation
        ctx['einsum'] = custom_einsum_float
        obs = measure_time(
            "loop_einsum_eq_th(einsum, equation, xs, ys)",
            div_by_number=True, context=ctx, repeat=5, number=1)
        obs['dim'] = dim
        obs['fct'] = 'c_einsum'
        res.append(obs)

        # transpose + custom implementation
        ctx['einsum'] = custom_einsum_float_tr
        obs = measure_time(
            "loop_einsum_eq(einsum, equation, xs, ys)",
            div_by_number=True, context=ctx, repeat=5, number=1)
        obs['dim'] = dim
        obs['fct'] = 'c_einsum_tr'
        res.append(obs)

        if tf_einsum is not None:
            # tensorflow
            ctx['einsum'] = tf_einsum
            ctx['xs'] = [convert_to_tensor(x) for x in xs]
            ctx['ys'] = [convert_to_tensor(y) for y in ys]
            obs = measure_time(
                "loop_einsum_eq(einsum, equation, xs, ys)",
                div_by_number=True, context=ctx, repeat=5, number=1)
            obs['dim'] = dim
            obs['fct'] = 'tf_einsum'
            res.append(obs)

        if torch_einsum is not None:
            # torch
            ctx['einsum'] = torch_einsum
            ctx['xs'] = [from_numpy(x) for x in xs]
            ctx['ys'] = [from_numpy(y) for y in ys]
            obs = measure_time(
                "loop_einsum_eq(einsum, equation, xs, ys)",
                div_by_number=True, context=ctx, repeat=5, number=1)
            obs['dim'] = dim
            obs['fct'] = 'torch_einsum'
            res.append(obs)

    # Dataframes
    df = pandas.DataFrame(res)
    piv = df.pivot(index='dim', columns='fct', values='average')

    rs = piv.copy()
    rs['c_einsum'] = rs['numpy.einsum'] / rs['c_einsum']
    rs['ort_einsum'] = rs['numpy.einsum'] / rs['ort_einsum']
    rs['ort_dec'] = rs['numpy.einsum'] / rs['ort_dec']
    rs['opt-einsum'] = rs['numpy.einsum'] / rs['opt-einsum']
    if 'c_einsum_tr' in rs.columns:
        rs['c_einsum_tr'] = rs['numpy.einsum'] / rs['c_einsum_tr']
    if 'tf_einsum' in rs.columns:
        rs['tf_einsum'] = rs['numpy.einsum'] / rs['tf_einsum']
    if 'torch_einsum' in rs.columns:
        rs['torch_einsum'] = rs['numpy.einsum'] / rs['torch_einsum']
    rs['numpy.einsum'] = 1.

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(14, 5))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title="Einsum benchmark\n%s -- (2, N, 12, 64)"
                   " lower better" % equation)
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title="Einsum Speedup, baseline=numpy\n%s -- (2, N, 12, 64)"
                  " higher better" % equation)
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return df, rs, ax
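# Example call with one possible equation compatible with the
# (2, N, 12, 64) inputs generated above (batch, sequence, heads,
# head size).
df, rs, ax = benchmark_equation("bsnh,btnh->bnts")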
def benchmark_op(repeat=5, number=2, name="Add", shape_fcts=None):
    if shape_fcts is None:
        def shape_fct(dim):
            return (5, dim, dim)
        shape_fcts = (shape_fct, shape_fct)
    ort_fct = build_ort_add()
    res = []
    for dim in tqdm([8, 16, 32, 64, 100, 128, 200, 256, 400,
                     512, 1024, 1536, 2048, 2560]):
        shape1 = shape_fcts[0](dim)
        shape2 = shape_fcts[1](dim)
        n_arrays = 16 if dim < 512 else 4
        if len(shape1) > 3:
            n_arrays = int(n_arrays / 4)
        xs = [numpy.random.rand(*shape1).astype(numpy.float32)
              for _ in range(n_arrays)]
        ys = [numpy.random.rand(*shape2).astype(numpy.float32)
              for _ in range(n_arrays)]
        info = dict(shape1=shape1, shape2=shape2)

        # numpy
        ctx = dict(
            xs=xs, ys=ys,
            fct=lambda x, y: numpy.add(numpy.add(numpy.add(x, y), y), y),
            loop_fct=loop_fct)
        obs = measure_time("loop_fct(fct, xs, ys)",
                           div_by_number=True, context=ctx,
                           repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'numpy'
        obs.update(info)
        res.append(obs)

        # onnxruntime
        ctx['fct'] = ort_fct
        obs = measure_time("loop_fct(fct, xs, ys)",
                           div_by_number=True, context=ctx,
                           repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'ort'
        obs.update(info)
        res.append(obs)

        if tf_add is not None:
            # tensorflow
            ctx['fct'] = lambda x, y: tf_add(tf_add(tf_add(x, y), y), y)
            ctx['xs'] = [convert_to_tensor(x) for x in xs]
            ctx['ys'] = [convert_to_tensor(y) for y in ys]
            obs = measure_time("loop_fct(fct, xs, ys)",
                               div_by_number=True, context=ctx,
                               repeat=repeat, number=number)
            obs['dim'] = dim
            obs['fct'] = 'tf'
            obs.update(info)
            res.append(obs)

        if torch_add is not None:
            # torch
            ctx['fct'] = lambda x, y: torch_add(
                torch_add(torch_add(x, y), y), y)
            ctx['xs'] = [from_numpy(x) for x in xs]
            ctx['ys'] = [from_numpy(y) for y in ys]
            obs = measure_time("loop_fct(fct, xs, ys)",
                               div_by_number=True, context=ctx,
                               repeat=repeat, number=number)
            obs['dim'] = dim
            obs['fct'] = 'torch'
            obs.update(info)
            res.append(obs)

    # Dataframes
    shape1_name = str(shape1).replace(str(dim), "N")
    shape2_name = str(shape2).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    df.columns = [_.replace('dim', 'N') for _ in df.columns]
    piv = df.pivot(index='N', columns='fct', values='average')

    rs = piv.copy()
    for c in ['ort', 'torch', 'tf']:
        if c in rs.columns:
            rs[c] = rs['numpy'] / rs[c]
    rs['numpy'] = 1.

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title="%s benchmark\n%s + %s lower better" % (
                 name, shape1_name, shape2_name))
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title="%s Speedup, baseline=numpy\n%s + %s higher better" % (
                name, shape1_name, shape2_name))
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return df, rs, ax
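# Default call: both operands take shape (5, N, N) and the addition is
# applied three times per pair of arrays.
df, rs, ax = benchmark_op()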
print("profiling...") txt = profile(runlocal, pyinst_format='text') print(txt[1]) ########################################## # Now let's measure the performance the average # computation time per observations for 2 to 100 # observations. The runtime implemented in # :epkg:`mlprodict` parallizes the computation # after a given number of observations. obs = [] for N in tqdm(list(range(2, 21))): m = measure_time("oinf.run({'X': x})", { 'oinf': oinf, 'x': X32[:N] }, div_by_number=True, number=20) m['N'] = N m['RT'] = 'ONNX' obs.append(m) with config_context(assume_finite=True): m = measure_time("hgb.predict(x)", { 'hgb': hgb, 'x': X32[:N] }, div_by_number=True, number=15) m['N'] = N m['RT'] = 'SKL'
r1 = py_topk.run({'X': X})
r1

###########################
#

r2 = ort_topk.run(None, {'X': X})
r2

#################################
# Some figures.

bs = []
bs.append(measure_time(lambda: py_topk.run({'X': X}),
                       context=globals(), div_by_number=True))
bs[-1]['c'] = 'py'
bs[-1]

#################################
#

bs.append(measure_time(lambda: ort_topk.run(None, {'X': X}),
                       context=globals(), div_by_number=True))
bs[-1]['c'] = 'or'
bs[-1]

#####################################
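# A small sketch putting both measurements side by side (assumes pandas
# is imported as in the other examples).

df = pandas.DataFrame(bs)
print(df[['c', 'average']])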
def benchmark_op(perm, repeat=5, number=5, name="Transpose", shape_fct=None):
    if shape_fct is None:
        def shape_fct(dim):
            return (3, dim, 1, 512)
    ort_fct = build_ort_transpose(perm)
    res = []
    for dim in tqdm([8, 16, 32, 64, 100, 128, 200, 256, 400, 512, 1024]):
        shape = shape_fct(dim)
        n_arrays = 10 if dim < 512 else 4
        xs = [numpy.random.rand(*shape).astype(numpy.float32)
              for _ in range(n_arrays)]
        ys = [perm for _ in range(n_arrays)]
        equation = perm2eq(perm)
        info = dict(perm=perm, shape=shape)

        # numpy
        ctx = dict(
            xs=xs, ys=ys,
            fct=lambda x, y: numpy.ascontiguousarray(numpy.transpose(x, y)),
            loop_fct=loop_fct)
        obs = measure_time("loop_fct(fct, xs, ys)",
                           div_by_number=True, context=ctx,
                           repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'numpy'
        obs.update(info)
        res.append(obs)

        # onnxruntime
        ctx['fct'] = ort_fct
        obs = measure_time("loop_fct(fct, xs, ys)",
                           div_by_number=True, context=ctx,
                           repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'ort'
        obs.update(info)
        res.append(obs)

        if tf_transpose is not None:
            # tensorflow
            ctx['fct'] = tf_transpose
            ctx['xs'] = [convert_to_tensor(x) for x in xs]
            ctx['ys'] = [convert_to_tensor(y) for y in ys]
            obs = measure_time("loop_fct(fct, xs, ys)",
                               div_by_number=True, context=ctx,
                               repeat=repeat, number=number)
            obs['dim'] = dim
            obs['fct'] = 'tf'
            obs.update(info)
            res.append(obs)

            # tensorflow with copy
            ctx['fct'] = lambda x, y: tf_transpose(
                convert_to_tensor(x)).numpy()
            ctx['xs'] = xs
            ctx['ys'] = ys
            obs = measure_time("loop_fct(fct, xs, ys)",
                               div_by_number=True, context=ctx,
                               repeat=repeat, number=number)
            obs['dim'] = dim
            obs['fct'] = 'tf_copy'
            obs.update(info)
            res.append(obs)

        if torch_einsum is not None:
            # torch
            ctx['fct'] = lambda x, y: torch_einsum(equation, x).contiguous()
            ctx['xs'] = [from_numpy(x) for x in xs]
            ctx['ys'] = ys  # [from_numpy(y) for y in ys]
            obs = measure_time("loop_fct(fct, xs, ys)",
                               div_by_number=True, context=ctx,
                               repeat=repeat, number=number)
            obs['dim'] = dim
            obs['fct'] = 'torch'
            obs.update(info)
            res.append(obs)

    # Dataframes
    shape_name = str(shape).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    df.columns = [_.replace('dim', 'N') for _ in df.columns]
    piv = df.pivot(index='N', columns='fct', values='average')

    rs = piv.copy()
    for c in ['ort', 'torch', 'tf', 'tf_copy']:
        if c in rs.columns:
            rs[c] = rs['numpy'] / rs[c]
    rs['numpy'] = 1.

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title="%s benchmark\n%r - %r - %s lower better" % (
                 name, shape_name, perm, equation))
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title="%s Speedup, baseline=numpy\n%r - %r - %s higher better" % (
                name, shape_name, perm, equation))
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return df, rs, ax
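# Example call: swap the first two axes of the (3, N, 1, 512) tensors
# generated inside the function.
df, rs, ax = benchmark_op(perm=(1, 0, 2, 3))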
def latency(model, law='normal', size=1, number=10, repeat=10, max_time=0,
            runtime="onnxruntime", device='cpu', profiling=None):
    """
    Measures the latency of a model (python API).

    :param model: ONNX graph
    :param law: random law used to generate fake inputs
    :param size: batch size, it replaces the first dimension
        of every input if it is left unknown
    :param number: number of calls to measure
    :param repeat: number of times to repeat the experiment
    :param max_time: if it is > 0, it runs as many times as possible
        during that period of time
    :param runtime: available runtime
    :param device: device, `cpu`, `cuda:0`
    :param profiling: if True, profiles the execution of every node;
        the result can be sorted by name or type, so the value for this
        parameter should be in `(None, 'name', 'type')`
    :return: dictionary, or a tuple (dictionary, dataframe)
        if profiling is enabled

    .. cmdref::
        :title: Measures model latency
        :cmd: -m mlprodict latency --help
        :lid: l-cmd-latency

        The command generates random inputs and calls the model many
        times on these inputs. It returns the processing time for one
        iteration.

        Example::

            python -m mlprodict latency --model "model.onnx"
    """
    if isinstance(model, str) and not os.path.exists(model):
        raise FileNotFoundError(  # pragma: no cover
            "Unable to find model %r." % model)
    if profiling not in (None, '', 'name', 'type'):
        raise ValueError("Unexpected value for profiling: %r." % profiling)
    size = int(size)
    number = int(number)
    repeat = int(repeat)
    if max_time in (None, 0, ""):
        max_time = None
    else:
        max_time = float(max_time)
        if max_time <= 0:
            max_time = None

    if law != "normal":
        raise ValueError("Only law='normal' is supported, not %r." % law)

    if device in ('cpu', 'CPUExecutionProvider'):
        providers = ['CPUExecutionProvider']
    elif device in ('cuda:0', 'CUDAExecutionProvider'):
        if runtime != 'onnxruntime':
            raise NotImplementedError(  # pragma: no cover
                "Only runtime 'onnxruntime' supports this device or "
                "provider %r." % device)
        providers = ['CUDAExecutionProvider']
    elif ',' in device:
        if runtime != 'onnxruntime':
            raise NotImplementedError(  # pragma: no cover
                "Only runtime 'onnxruntime' supports this device or "
                "provider %r." % device)
        providers = device.split(',')
        allp = set(get_all_providers())
        for p in providers:
            if p not in allp:
                raise ValueError(
                    "One device or provider %r is not supported among %r."
                    "" % (p, allp))
    else:
        raise ValueError(  # pragma: no cover
            "Device %r not supported." % device)

    if runtime == "onnxruntime":
        if profiling in ('name', 'type'):
            so = SessionOptions()
            so.enable_profiling = True
            sess = InferenceSession(model, sess_options=so,
                                    providers=providers)
        else:
            sess = InferenceSession(model, providers=providers)
        fct = lambda feeds: sess.run(None, feeds)
        inputs = sess.get_inputs()
    else:
        if profiling in ('name', 'type'):
            runtime_options = {"enable_profiling": True}
            if runtime != 'onnxruntime1':
                raise NotImplementedError(  # pragma: no cover
                    "Profiling is not implemented for runtime=%r."
                    % runtime)
        else:
            runtime_options = None
        oinf = OnnxInference(model, runtime=runtime,
                             runtime_options=runtime_options)
        fct = lambda feeds: oinf.run(feeds)
        inputs = oinf.obj.graph.input

    feeds = random_feed(inputs, size)
    res = measure_time(lambda: fct(feeds), number=number, repeat=repeat,
                       context={}, max_time=max_time, div_by_number=True)
    for k, v in feeds.items():
        res["shape(%s)" % k] = "x".join(map(str, v.shape))
    if profiling in ('name', 'type'):
        if runtime == 'onnxruntime':
            profile_name = sess.end_profiling()
            with open(profile_name, 'r', encoding='utf-8') as f:
                js = json.load(f)
            js = OnnxWholeSession.process_profiling(js)
            df = DataFrame(js)
        else:
            df = oinf.get_profiling(as_df=True)
        if profiling == 'name':
            gr = df[['dur', "args_op_name", "name"]].groupby(
                ["args_op_name", "name"]).sum().sort_values('dur')
        else:
            gr = df[['dur', "args_op_name"]].groupby(
                "args_op_name").sum().sort_values('dur')
        return res, gr
    return res
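# A hypothetical call through the python API; "model.onnx" is a
# placeholder path and must point to an existing ONNX file.
res = latency("model.onnx", size=4, number=5, repeat=5)
print(res)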
def benchmark_equation():
    # equations
    ort_where = build_ort_where()
    ort_where_add = build_ort_where_add()
    res = []
    for dim in tqdm([8, 16, 32, 64, 100, 128, 200, 256,
                     500, 512, 1024, 2048]):
        repeat = 5
        number = 10

        conds = [(numpy.random.rand(dim, dim) < 0.5).astype(numpy.bool_)
                 for _ in range(repeat)]
        xs = [numpy.random.rand(dim, dim).astype(numpy.float32)
              for _ in range(repeat)]
        ys = [numpy.random.rand(dim, dim).astype(numpy.float32)
              for _ in range(repeat)]

        # numpy
        ctx = dict(conds=conds, xs=xs, ys=ys,
                   where=numpy.where, loop_where=loop_where)
        obs = measure_time(
            "loop_where(where, conds, xs, ys)",
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'numpy.where'
        res.append(obs)

        # numpy add
        ctx['where'] = numpy_where_add
        obs = measure_time(
            "loop_where(where, conds, xs, ys)",
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'numpy_where_add'
        res.append(obs)

        # onnxruntime
        ctx['where'] = ort_where
        obs = measure_time(
            "loop_where(where, conds, xs, ys)",
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'ort_where'
        res.append(obs)

        # onnxruntime - 2
        ctx['where'] = ort_where_add
        ctx['conds'] = [c.astype(numpy.float32) for c in conds]
        obs = measure_time(
            "loop_where(where, conds, xs, ys)",
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'ort_where_add'
        res.append(obs)

        if tf_where is not None:
            # tensorflow
            ctx['where'] = tf_where
            ctx['conds'] = [convert_to_tensor(c) for c in conds]
            ctx['xs'] = [convert_to_tensor(x) for x in xs]
            ctx['ys'] = [convert_to_tensor(y) for y in ys]
            obs = measure_time(
                "loop_where(where, conds, xs, ys)",
                div_by_number=True, context=ctx,
                repeat=repeat, number=number)
            obs['dim'] = dim
            obs['fct'] = 'tf_where'
            res.append(obs)

        if torch_where is not None:
            # torch
            ctx['where'] = torch_where
            ctx['conds'] = [from_numpy(c) for c in conds]
            ctx['xs'] = [from_numpy(x) for x in xs]
            ctx['ys'] = [from_numpy(y) for y in ys]
            obs = measure_time(
                "loop_where(where, conds, xs, ys)",
                div_by_number=True, context=ctx,
                repeat=repeat, number=number)
            obs['dim'] = dim
            obs['fct'] = 'torch_where'
            res.append(obs)

    # Dataframes
    df = pandas.DataFrame(res)
    piv = df.pivot(index='dim', columns='fct', values='average')

    rs = piv.copy()
    rs['ort_where'] = rs['numpy.where'] / rs['ort_where']
    rs['numpy_where_add'] = rs['numpy.where'] / rs['numpy_where_add']
    rs['ort_where_add'] = rs['numpy.where'] / rs['ort_where_add']
    if 'tf_where' in rs.columns:
        rs['tf_where'] = rs['numpy.where'] / rs['tf_where']
    if 'torch_where' in rs.columns:
        rs['torch_where'] = rs['numpy.where'] / rs['torch_where']
    rs['numpy.where'] = 1.

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title="Where benchmark -- (N, N)\nlower better")
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title="Where Speedup, baseline=numpy -- (N, N)\nhigher better")
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return df, rs, ax
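# The function takes no argument: it benchmarks Where on square (N, N)
# inputs and returns the raw timings, the speedups and the matplotlib axes.
df, rs, ax = benchmark_equation()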