Example #1
0
def benchmark_op(repeat=10,
                 number=10,
                 name="Slice",
                 shape_slice_fct=None,
                 save=None,
                 opset=14,
                 repeat_profile=1500,
                 verbose=1):
    """
    Benchmarks one operator with numpy, onnxruntime (CPU) and, when
    `build_ort_op` returns a GPU function, onnxruntime (GPU), then
    profiles the ONNX graph built for the last tested dimension.

    :param repeat: `repeat` argument forwarded to `measure_time`
    :param number: `number` argument forwarded to `measure_time`
    :param name: operator name, only used in the graph titles
    :param shape_slice_fct: callable taking a dimension and returning
        a tuple `(shape, slices)`, it is mandatory
    :param save: forwarded to `build_ort_op`
    :param opset: forwarded to `build_ort_op` as `op_version`
    :param repeat_profile: number of runs done while profiling
    :param verbose: prints progress messages if not null
    :return: tuple `(dfprof, dfprofgpu, df, rs, ax)`: CPU profiling
        dataframe, GPU profiling dataframe (or None), raw measures,
        pivoted measures with the ratio columns, matplotlib axes
    :raises TypeError: if *shape_slice_fct* is not callable
    """
    if not callable(shape_slice_fct):
        # Fail early with an explicit message instead of the opaque
        # "'NoneType' object is not callable" raised inside the loop.
        raise TypeError(
            "shape_slice_fct must be a callable returning (shape, slices), "
            "not %r." % (shape_slice_fct, ))
    if verbose:
        print("[benchmark_op] start repeat=%d number=%d repeat_profile=%d"
              " opset=%d." % (repeat, number, repeat_profile, opset))

    def _measure(label, fct, inputs, ctx, dim, shape, slices):
        "Measures one implementation and returns the annotated observation."
        ctx['fct'] = fct
        obs = measure_time(lambda: loop_fct(fct, inputs),
                           div_by_number=True,
                           context=ctx,
                           repeat=repeat,
                           number=number)
        obs['dim'] = dim
        obs['fct'] = label
        # NOTE(review): the raw shape object is stored here, not the
        # comma-joined string used for the profiling dataframes. This
        # keeps the content of the returned dataframe unchanged.
        obs['shape'] = shape
        obs['slices'] = str(slices)
        return obs

    def _profile_frame(session, shape, slices):
        "Ends the profiling of *session* and loads the result."
        with open(session.end_profiling(), "r", encoding="utf-8") as f:
            js = json.load(f)
        frame = DataFrame(OnnxWholeSession.process_profiling(js))
        frame['shape'] = ",".join(map(str, shape))
        frame['slices'] = str(slices)
        return frame

    res = []
    for dim in tqdm([
            8, 16, 32, 64, 100, 128, 200, 256, 400, 512, 600, 784, 800, 1000,
            1024, 1200
    ]):
        shape, slices = shape_slice_fct(dim)
        onx, ort_fct, npy_fct, ort_fct_gpu = build_ort_op(save=save,
                                                          op_version=opset,
                                                          slices=slices)

        # fewer arrays for the biggest dimensions
        n_arrays = 10 if dim >= 512 else 20
        xs = [
            numpy.random.rand(*shape).astype(numpy.float32)
            for _ in range(n_arrays)
        ]
        ctx = dict(xs=xs, loop_fct=loop_fct)

        # numpy
        res.append(_measure('numpy', npy_fct, xs, ctx, dim, shape, slices))

        # onnxruntime
        res.append(_measure('ort', ort_fct, xs, ctx, dim, shape, slices))

        if ort_fct_gpu is not None:
            # onnxruntime on GPU, inputs are moved to the device first
            dev = get_ort_device('cuda:0')
            ctx['xs'] = [C_OrtValue.ortvalue_from_numpy(x, dev) for x in xs]
            res.append(_measure('ort_gpu', ort_fct_gpu, ctx['xs'], ctx,
                                dim, shape, slices))

    # profiling CPU, it uses the model and the data of the last dimension
    if verbose:
        print("[benchmark_op] done.")
        print("[benchmark_op] profile CPU.")
    so = SessionOptions()
    so.enable_profiling = True
    sess = InferenceSession(onx.SerializeToString(),
                            so,
                            providers=["CPUExecutionProvider"])
    feeds = {'X': xs[-1]}
    for _ in range(repeat_profile):
        sess.run(None, feeds)
    dfprof = _profile_frame(sess, shape, slices)
    if verbose:
        print("[benchmark_op] done.")

    # profiling GPU
    if ort_fct_gpu is not None:
        if verbose:
            print("[benchmark_op] profile GPU.")
        so = SessionOptions()
        so.enable_profiling = True
        sess = InferenceSession(onx.SerializeToString(),
                                so,
                                providers=["CUDAExecutionProvider"])
        io_binding = sess.io_binding()._iobinding
        # NOTE(review): the input below was created on 'cuda:0' but it is
        # bound with the 'cpu' device -- confirm this is intended.
        device = get_ort_device('cpu')

        x = ctx['xs'][-1]
        for _ in range(repeat_profile):
            io_binding.bind_input('X', device, numpy.float32, x.shape(),
                                  x.data_ptr())
            io_binding.bind_output('Y', device)
            sess._sess.run_with_iobinding(io_binding, None)

        dfprofgpu = _profile_frame(sess, shape, slices)
        if verbose:
            print("[benchmark_op] profile done.")
    else:
        dfprofgpu = None

    # Dataframes
    shape_name = str(shape).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    # keywords are mandatory for DataFrame.pivot in recent pandas
    piv = df.pivot(index='shape', columns='fct', values='average')

    rs = piv.copy()
    for c in ['numpy', 'ort', 'ort_gpu']:
        if c in rs.columns:
            rs[f"numpy/{c}"] = rs['numpy'] / rs[c]
    # among the columns created above, only "numpy/numpy" contains "/numpy"
    rs = rs[[c for c in rs.columns if "/numpy" not in c]].copy()

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True,
             logy=True,
             ax=ax[0],
             title=f"{name} benchmark\n{shape_name!r} lower better")
    ax[0].legend(prop={"size": 9})
    rs.plot(
        logx=True,
        logy=True,
        ax=ax[1],
        title=f"{name} Speedup, baseline=numpy\n{shape_name!r} higher better")
    # horizontal guides at ratios 0.5 and 2
    bounds = [min(rs.index), max(rs.index)]
    ax[1].plot(bounds, [0.5, 0.5], 'g--')
    ax[1].plot(bounds, [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return dfprof, dfprofgpu, df, rs, ax
Example #2
0
class InferenceSession:  # pylint: disable=E0102
    """
    Wrappers around InferenceSession from :epkg:`onnxruntime`.

    :param onnx_bytes: onnx bytes
    :param sess_options: session options, if None, a new `SessionOptions`
        is created and receives *log_severity_level*
    :param log_severity_level: change the logging level
    :param device: device, a string `cpu`, `cuda`, `cuda:0`...,
        None is equivalent to `cpu`
    """
    def __init__(self,
                 onnx_bytes,
                 sess_options=None,
                 log_severity_level=4,
                 device=None):
        # The wrapped class must be tested: `InferenceSession` is this very
        # class and can never be None at this point.
        if OrtInferenceSession is None:
            raise ImportError(  # pragma: no cover
                "onnxruntime is not available.")
        self.log_severity_level = log_severity_level
        self.device = get_ort_device('cpu' if device is None else device)
        self.providers = device_to_providers(self.device)
        set_default_logger_severity(3)
        if sess_options is None:
            self.so = SessionOptions()
            self.so.log_severity_level = log_severity_level
        else:
            # options given by the user are used as they are
            self.so = sess_options
        self.sess = OrtInferenceSession(onnx_bytes,
                                        sess_options=self.so,
                                        providers=self.providers)
        # default run options used when `run` does not receive any
        self.ro = RunOptions()
        self.ro.log_severity_level = log_severity_level
        self.ro.log_verbosity_level = log_severity_level
        self.output_names = [o.name for o in self.get_outputs()]

    def run(self, output_names, input_feed, run_options=None):
        """
        Executes the ONNX graph.

        :param output_names: None for all, a name for a specific output,
            it is ignored if one of the inputs is a `C_OrtValue`,
            all outputs are requested in that case
        :param input_feed: dictionary of inputs
        :param run_options: None or RunOptions, None means the
            options created by the constructor
        :return: whatever the underlying session returns
        """
        options = run_options or self.ro
        if any(isinstance(v, C_OrtValue) for v in input_feed.values()):
            return self.sess._sess.run_with_ort_values(input_feed,
                                                       self.output_names,
                                                       options)
        return self.sess.run(output_names, input_feed, options)

    def get_inputs(self):
        "Returns input types."
        return self.sess.get_inputs()

    def get_outputs(self):
        "Returns output types."
        return self.sess.get_outputs()

    def end_profiling(self):
        "Ends profiling."
        return self.sess.end_profiling()
        bind.bind_input(name, ort_device, dtype, value.shape(),
                        value.data_ptr())
    for out in outputs:
        bind.bind_output(out, ort_device)
    sess._sess.run_with_iobinding(bind, None)
    ortvalues = bind.get_outputs()
    return [o.numpy() for o in ortvalues]


#######################################
# The profiling.
# The model is run ten times with the bound inputs and outputs,
# then the profiling is ended and its result is loaded into a dataframe.

for i in tqdm(range(0, 10)):
    run_with_iobinding(sess, bind, ort_device, feed_ort_value, outputs)

# end_profiling returns the name of the file opened just below
prof = sess.end_profiling()
with open(prof, "r") as f:
    js = json.load(f)
df = pandas.DataFrame(OnnxWholeSession.process_profiling(js))
# bare expression, presumably to display the dataframe in the gallery
df

###################################
# First graph is by operator type.

# total duration per operator type, sorted by increasing duration
gr_dur = df[['dur',
             "args_op_name"]].groupby("args_op_name").sum().sort_values('dur')
# durations become a fraction of the total
total = gr_dur['dur'].sum()
gr_dur /= total
# number of rows per operator type
gr_n = df[['dur',
           "args_op_name"]].groupby("args_op_name").count().sort_values('dur')
# same row order as gr_dur
gr_n = gr_n.loc[gr_dur.index, :]
Example #4
0
def latency(model,
            law='normal',
            size=1,
            number=10,
            repeat=10,
            max_time=0,
            runtime="onnxruntime",
            device='cpu',
            profiling=None):
    """
    Measures the latency of a model (python API).

    :param model: ONNX graph
    :param law: random law used to generate fake inputs
    :param size: batch size, it replaces the first dimension
        of every input if it is left unknown
    :param number: number of calls to measure
    :param repeat: number of times to repeat the experiment
    :param max_time: if it is > 0, it runs as many time during
        that period of time
    :param runtime: available runtime
    :param device: device, `cpu`, `cuda:0`, a provider name
        (`CPUExecutionProvider`, `CUDAExecutionProvider`) or a
        comma separated list of provider names
    :param profiling: if not empty, profile the execution of every
        node, it can be sorted by name or type,
        the value for this parameter should be in `(None, 'name', 'type')`,
    :return: dictionary or a tuple (dictionary, dataframe)
        if the profiling is enabled

    .. cmdref::
        :title: Measures model latency
        :cmd: -m mlprodict latency --help
        :lid: l-cmd-latency

        The command generates random inputs and call many times the
        model on these inputs. It returns the processing time for one
        iteration.

        Example::

            python -m mlprodict latency --model "model.onnx"
    """
    if isinstance(model, str) and not os.path.exists(model):
        raise FileNotFoundError(  # pragma: no cover
            "Unable to find model %r." % model)
    if profiling not in (None, '', 'name', 'type'):
        raise ValueError("Unexpected value for profiling: %r." % profiling)
    do_profile = profiling in ('name', 'type')
    size = int(size)
    number = int(number)
    repeat = int(repeat)
    if max_time in (None, 0, ""):
        max_time = None
    else:
        max_time = float(max_time)
        if max_time <= 0:
            max_time = None

    if law != "normal":
        raise ValueError("Only law='normal' is supported, not %r." % law)

    # The names ending with 's' are misspelled, they are still accepted
    # for backward compatibility.
    cpu_names = ('cpu', 'CPUExecutionProvider', 'CPUExecutionProviders')
    cuda_names = ('cuda:0', 'CUDAExecutionProvider', 'CUDAExecutionProviders')
    # NOTE(review): `providers` is validated but it is not forwarded to the
    # session created below -- confirm how the device should be given
    # to the InferenceSession in scope.
    if device in cpu_names:
        providers = ['CPUExecutionProvider']
    elif device in cuda_names:
        if runtime != 'onnxruntime':
            raise NotImplementedError(  # pragma: no cover
                "Only runtime 'onnxruntime' supports this device or provider "
                "%r." % device)
        providers = ['CUDAExecutionProvider']
    elif ',' in device:
        if runtime != 'onnxruntime':
            raise NotImplementedError(  # pragma: no cover
                "Only runtime 'onnxruntime' supports this device or provider "
                "%r." % device)
        providers = device.split(',')
        allp = set(get_all_providers())
        for p in providers:
            if p not in allp:
                raise ValueError(
                    "One device or provider %r is not supported among %r."
                    "" % (p, allp))
    else:
        raise ValueError(  # pragma: no cover
            "Device %r not supported." % device)

    if runtime == "onnxruntime":
        if do_profile:
            so = SessionOptions()
            so.enable_profiling = True
            sess = InferenceSession(model, sess_options=so)
        else:
            sess = InferenceSession(model)

        def fct(feeds):
            "Runs the model with onnxruntime."
            return sess.run(None, feeds)

        inputs = sess.get_inputs()
    else:
        if do_profile:
            runtime_options = {"enable_profiling": True}
            if runtime != 'onnxruntime1':
                raise NotImplementedError(  # pragma: no cover
                    "Profiling is not implemented for runtime=%r." % runtime)
        else:
            runtime_options = None
        oinf = OnnxInference(model,
                             runtime=runtime,
                             runtime_options=runtime_options)

        def fct(feeds):
            "Runs the model with OnnxInference."
            return oinf.run(feeds)

        inputs = oinf.obj.graph.input

    feeds = random_feed(inputs, size)
    res = measure_time(lambda: fct(feeds),
                       number=number,
                       repeat=repeat,
                       context={},
                       max_time=max_time,
                       div_by_number=True)
    for k, v in feeds.items():
        res["shape(%s)" % k] = "x".join(map(str, v.shape))
    if not do_profile:
        return res

    if runtime == 'onnxruntime':
        profile_name = sess.end_profiling()
        with open(profile_name, 'r', encoding='utf-8') as f:
            js = json.load(f)
        df = DataFrame(OnnxWholeSession.process_profiling(js))
    else:
        df = oinf.get_profiling(as_df=True)

    # durations are aggregated per node name or per operator type
    if profiling == 'name':
        keys = ["args_op_name", "name"]
        gr = df[['dur'] + keys].groupby(keys).sum().sort_values('dur')
    else:
        gr = df[['dur', "args_op_name"
                 ]].groupby("args_op_name").sum().sort_values('dur')
    return res, gr