def compile_and_run_vm(mod, params, data_np, target):
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, target=target, params=params)

    dev = tvm.device(target, 0)
    vm = VirtualMachine(vm_exec, dev)
    # `input_name` is taken from the enclosing scope.
    vm.set_input("main", **{input_name: data_np})
    return vm.run()
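A minimal call-site sketch for compile_and_run_vm, assuming a TorchScript model imported via relay.frontend.from_pytorch; the model choice, input name, and shape below are illustrative, not taken from the original code:

import numpy as np
import torch
import torchvision
import tvm
from tvm import relay
from tvm.runtime.vm import VirtualMachine

input_name = "input0"            # free variable read by compile_and_run_vm
input_shape = (1, 3, 224, 224)   # illustrative NCHW shape

# Trace a small torchvision model and import it into Relay.
model = torchvision.models.resnet18().eval()
scripted = torch.jit.trace(model, torch.randn(input_shape))
mod, params = relay.frontend.from_pytorch(scripted, [(input_name, input_shape)])

data_np = np.random.uniform(size=input_shape).astype("float32")
res = compile_and_run_vm(mod, params, data_np, "llvm")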
Example #2
    def f_timer(
        rt_mod: Union[tvm.runtime.Module, tvm.runtime.vm.Executable],
        dev: tvm.device,
        input_data: Dict[str, NDArray],
    ) -> None:
        """Run and benchmark the given runtime module, print out the result.

        Parameters
        ----------
        rt_mod : Union[tvm.runtime.Module, tvm.runtime.vm.Executable]
            The runtime module or vm executable.
        dev : tvm.device
            The device on which to run the workload.
        input_data : Dict[str, NDArray]
            The input data as a dictionary.
        """
        from tvm.contrib.graph_executor import GraphModule  # pylint:disable=import-outside-toplevel
        from tvm.runtime.vm import VirtualMachine  # pylint:disable=import-outside-toplevel

        try:
            if backend == "vm":
                vm = VirtualMachine(rt_mod, dev)  # pylint: disable=invalid-name
                ftimer = vm.benchmark(dev,
                                      min_repeat_ms=500,
                                      repeat=5,
                                      number=1,
                                      end_to_end=False,
                                      **input_data)
            elif backend == "graph":
                mod = GraphModule(rt_mod["default"](dev))
                for input_name, input_value in input_data.items():
                    mod.set_input(input_name, input_value)
                ftimer = mod.module.time_evaluator("run",
                                                   dev,
                                                   min_repeat_ms=500,
                                                   repeat=5,
                                                   number=1)()
            else:
                raise ValueError(
                    f"Backend {backend} not supported in f_timer!")

            results = list(np.array(ftimer.results) * 1000.0)  # type: ignore

            print("Running time in time_evaluator: ", results)
            print("-------------------------------")
            print(f"    Min (ms) : {min(results)}")
            print(f"    Max (ms) : {max(results)}")
            print(f" Median (ms) : {median(results)}")
            print(f"Average (ms) : {sum(results) / len(results)}")
        except Exception as exc:  # pylint: disable=broad-except
            print(f"Run module f_timer via RPC failed, exception: {exc}")
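A hedged call-site sketch for f_timer: it closes over a backend variable ("vm" or "graph") and assumes median (e.g. from the statistics module) is in scope; mod, params, and the input name below are illustrative:

from statistics import median

backend = "vm"  # read by f_timer from the enclosing scope

with tvm.transform.PassContext(opt_level=3):
    vm_exec = relay.vm.compile(mod, target="llvm", params=params)

dev = tvm.cpu(0)
input_data = {"input0": tvm.nd.array(data_np, device=dev)}
f_timer(vm_exec, dev, input_data)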
Example #3
def test_detection_models():
    img = "test_street_small.jpg"
    img_url = ("https://raw.githubusercontent.com/dmlc/web-data/"
               "master/gluoncv/detection/street_small.jpg")
    download(img_url, img)

    input_shape = (1, 3, in_size, in_size)
    target = "llvm"
    input_name = "input0"
    shape_list = [(input_name, input_shape)]
    score_threshold = 0.9

    scripted_model = generate_jit_model(1)
    mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)

    with tvm.transform.PassContext(opt_level=3,
                                   disabled_pass=["FoldScaleAxis"]):
        vm_exec = relay.vm.compile(mod, target=target, params=params)

    ctx = tvm.cpu()
    vm = VirtualMachine(vm_exec, ctx)
    data = process_image(img)
    pt_res = scripted_model(data)
    data = data.detach().numpy()
    vm.set_input("main", **{input_name: data})
    tvm_res = vm.run()

    # Note: due to accumulated numerical error, we can't directly compare results
    # with pytorch output. Some boxes might have a quite tiny difference in score
    # and the order can become different. We just measure how many valid boxes
    # there are for input image.
    pt_scores = pt_res[1].detach().numpy().tolist()
    tvm_scores = tvm_res[1].asnumpy().tolist()
    num_pt_valid_scores = num_tvm_valid_scores = 0

    for score in pt_scores:
        if score >= score_threshold:
            num_pt_valid_scores += 1
        else:
            break

    for score in tvm_scores:
        if score >= score_threshold:
            num_tvm_valid_scores += 1
        else:
            break

    assert num_pt_valid_scores == num_tvm_valid_scores, (
        "Output mismatch: Under score threshold {}, PyTorch has {} valid "
        "boxes while TVM has {}.".format(score_threshold, num_pt_valid_scores,
                                         num_tvm_valid_scores))
Example #4
def get_ref_vm(mod, params, target="cuda"):
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, target=target, params=params)
        code, lib = vm_exec.save()
    dev = tvm.device(target, 0)
    vm_exec = tvm.runtime.vm.Executable.load_exec(code, lib)
    return VirtualMachine(vm_exec, dev), dev
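get_ref_vm round-trips the executable through save()/load_exec in memory. To persist it to disk instead, the pattern documented for the Relay VM looks like this sketch (file names illustrative):

from tvm.contrib import utils
from tvm.runtime.vm import Executable, VirtualMachine

code, lib = vm_exec.save()
tmp = utils.tempdir()
path_lib = tmp.relpath("lib.so")
lib.export_library(path_lib)        # compiled kernels go into a shared library
with open(tmp.relpath("code.ro"), "wb") as fo:
    fo.write(code)                  # VM bytecode is opaque bytes

loaded_lib = tvm.runtime.load_module(path_lib)
loaded_code = bytearray(open(tmp.relpath("code.ro"), "rb").read())
des_exec = Executable.load_exec(loaded_code, loaded_lib)
vm = VirtualMachine(des_exec, tvm.device("cuda", 0))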
Example #5
def profile_and_build_vm(
    mod,
    params,
    sm,
    split_k_slices=[1],
    tmp_dir="./tmp",
    lib_path="compile.so",
    vmcode_path="vmcode.ro",
    use_fast_math=False,
    use_3xtf32=True,
):
    mod = partition_for_cutlass(mod)
    mod, num_cutlass_partition = tune_cutlass_kernels(
        mod,
        sm,
        split_k_slices=split_k_slices,
        use_3xtf32=use_3xtf32,
        profile_all_alignments=False,
        find_first_valid=True,
        tmp_dir=tmp_dir,
    )
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, target="cuda", params=params)
    vm_exec = build_cutlass_kernels_vm(vm_exec,
                                       sm,
                                       tmp_dir,
                                       lib_path,
                                       vmcode_path,
                                       use_fast_math=use_fast_math)
    dev = tvm.device("cuda", 0)
    return VirtualMachine(vm_exec, dev), dev, num_cutlass_partition
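A hedged call-site sketch for profile_and_build_vm, assuming a single fp16 dense workload and an Ampere GPU (sm=80); names, shapes, and values are illustrative:

import numpy as np
import tvm
from tvm import relay

M, N, K = 1024, 1024, 1024
data = relay.var("data", shape=(M, K), dtype="float16")
weight = relay.var("weight", shape=(N, K), dtype="float16")
mod = tvm.IRModule.from_expr(relay.nn.dense(data, weight, out_dtype="float16"))
params = {"weight": tvm.nd.array(np.random.normal(size=(N, K)).astype("float16"))}

vm, dev, num_partition = profile_and_build_vm(mod, params, sm=80)
x_np = np.random.normal(size=(M, K)).astype("float16")
out = vm.invoke("main", tvm.nd.array(x_np, device=dev))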
Example #6
def test_detection_models():
    img = "test_street_small.jpg"
    img_url = (
        "https://raw.githubusercontent.com/dmlc/web-data/"
        "master/gluoncv/detection/street_small.jpg"
    )
    download(img_url, img)

    input_shape = (1, 3, in_size, in_size)

    input_name = "input0"
    shape_list = [(input_name, input_shape)]

    scripted_model = generate_jit_model(1)
    mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)

    data = process_image(img)
    data_np = data.detach().numpy()

    with torch.no_grad():
        pt_res = scripted_model(data)

    for target in ["llvm", "cuda"]:
        with tvm.transform.PassContext(opt_level=3):
            vm_exec = relay.vm.compile(mod, target=target, params=params)

        ctx = tvm.device(target, 0)  # tvm.context was renamed to tvm.device in newer TVM
        vm = VirtualMachine(vm_exec, ctx)

        vm.set_input("main", **{input_name: data_np})
        tvm_res = vm.run()

        # Bounding boxes
        tvm.testing.assert_allclose(
            pt_res[0].cpu().numpy(), tvm_res[0].asnumpy(), rtol=1e-5, atol=1e-5
        )
        # Scores
        tvm.testing.assert_allclose(
            pt_res[1].cpu().numpy(), tvm_res[1].asnumpy(), rtol=1e-5, atol=1e-5
        )
        # Class ids
        np.testing.assert_equal(pt_res[2].cpu().numpy(), tvm_res[2].asnumpy())

        score_threshold = 0.9
        print("Num boxes:", pt_res[0].cpu().numpy().shape[0])
        print("Num valid boxes:", np.sum(pt_res[1].cpu().numpy() >= score_threshold))
Example #7
def profile_and_build_vm(
    mod, params, sm, tmp_dir="./tmp", lib_path="compile.so", vmcode_path="vmcode.ro"
):
    mod = partition_for_cutlass(mod)
    mod, num_cutlass_partition = tune_cutlass_kernels(mod, sm, tmp_dir=tmp_dir)
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, target="cuda", params=params)
    vm_exec = build_cutlass_kernels_vm(vm_exec, sm, tmp_dir, lib_path, vmcode_path)
    dev = tvm.device("cuda", 0)
    return VirtualMachine(vm_exec, dev), dev, num_cutlass_partition
Example #8
def build_simulated_quantize(input_data, scale, zp, dtype, axis=-1):
    sim_q = relay.qnn.op.simulated_quantize(
        input_data,
        scale,
        zp,
        axis=axis,
        out_dtype=dtype,
    )
    mod = tvm.IRModule.from_expr(sim_q)
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, "llvm", params=None)
    vm = VirtualMachine(vm_exec, tvm.cpu(0))
    return vm
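A hedged usage sketch for build_simulated_quantize: the relay variables passed in become the runtime arguments of the compiled "main", so scale and zero point can vary per call. Shapes, dtypes, and values below are assumptions for illustration:

import numpy as np
from tvm import relay

data_shape = [2, 5]
input_data = relay.var("input_data", shape=data_shape, dtype="float32")
scale = relay.var("scale", shape=[], dtype="float32")
zp = relay.var("zp", shape=[], dtype="int32")  # assumed zero-point dtype

vm = build_simulated_quantize(input_data, scale, zp, "int8")
out = vm.invoke(
    "main",
    np.random.uniform(low=-128, high=127, size=data_shape).astype("float32"),
    np.array(0.5, dtype="float32"),  # scale
    np.array(127, dtype="int32"),    # zero point
)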
Example #9
# It is recommended to build TVM with Intel MKL and Intel OpenMP to get the
# best performance, due to the existence of large dense operators in
# torchvision rcnn models.

# Add "-libs=mkl" to get best performance on x86 target.
# For x86 machines that support AVX-512, the complete target is
# "llvm -mcpu=skylake-avx512 -libs=mkl"
target = "llvm"

with tvm.transform.PassContext(opt_level=3, disabled_pass=["FoldScaleAxis"]):
    vm_exec = relay.vm.compile(mod, target=target, params=params)

######################################################################
# Inference with Relay VM
# -----------------------
dev = tvm.cpu()
vm = VirtualMachine(vm_exec, dev)
vm.set_input("main", **{input_name: img})
tvm_res = vm.run()

######################################################################
# Get boxes with score larger than 0.9
# ------------------------------------
score_threshold = 0.9
boxes = tvm_res[0].numpy().tolist()
valid_boxes = []
for i, score in enumerate(tvm_res[1].numpy().tolist()):
    if score > score_threshold:
        valid_boxes.append(boxes[i])
    else:
        break
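The early break above relies on torchvision detection scores arriving sorted in descending order. Under that same assumption, a vectorized numpy equivalent is:

scores = tvm_res[1].numpy()
valid_boxes = tvm_res[0].numpy()[scores > score_threshold].tolist()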
Example #10
def run_vm(code, lib):
    vm_exec = tvm.runtime.vm.Executable.load_exec(code, lib)
    # `i_data` comes from the enclosing scope; tvm.gpu was renamed tvm.cuda in newer TVM
    vm = VirtualMachine(vm_exec, tvm.cuda(0))
    result = vm.invoke("main", data=i_data)
    return result
Example #11
def run_vm(vm_exec, input_, ctx=tvm.cpu(0)):
    vm = VirtualMachine(vm_exec, ctx)
    _out = vm.invoke("main", input_)
    return vmobj_to_list(_out)
Example #12
def run_case(dtype, image, target):
    # Check image
    import os
    import json
    import sys

    STAT_REPEAT = int(os.environ.get('STAT_REPEAT') or 10)

    # FGG: set model files via CK env
    CATEG_FILE = '../synset.txt'
    # The synset file is expected to contain a Python literal mapping class ids to labels.
    synset = eval(open(CATEG_FILE).read())

    files = []
    val = {}

    # FGG: set timers
    import time
    timers = {}

    img_orig = cv2.imread(image)

    img = cv2.cvtColor(img_orig, cv2.COLOR_BGR2RGB)

    output_height, output_width, _ = 224, 224, 3
    img = resize_with_aspectratio(img, output_height, output_width, inter_pol=cv2.INTER_AREA)
    img = center_crop(img, output_height, output_width)
    img = np.asarray(img, dtype='float32')

    # normalize image
    means = np.array([123.68, 116.78, 103.94], dtype=np.float32)
    img -= means

    # transpose if needed
    img = img.transpose([2, 0, 1])

    import matplotlib.pyplot as plt
    img1 = img.transpose([1, 2, 0])
    arr_ = np.squeeze(img1)  # pass an axis argument to squeeze a specific dimension
    plt.imshow(arr_)
#    plt.show()
    plt.savefig('pre-processed-image.png')

    # Load model
    model_path = os.environ.get('CK_ENV_ONNX_MODEL_ONNX_FILEPATH', '')
    if model_path == '':
        print('Error: environment variable CK_ENV_ONNX_MODEL_ONNX_FILEPATH is not defined')
        sys.exit(1)

    opt = rt.SessionOptions()

    sess = rt.InferenceSession(model_path, opt)

    inputs = [meta.name for meta in sess.get_inputs()]
    outputs = [meta.name for meta in sess.get_outputs()]

    print(inputs)
    print(outputs)

    if os.environ.get('USE_TVM', '') == 'yes':
        import tvm
        from tvm import relay
        import onnx

        del sess

        # Load model via ONNX to be used with TVM
        print ('')
        print ('ONNX: load model ...')
        print ('')

        onnx_model = onnx.load(model_path)

        # Init TVM
        # TBD: add tvm platform selector
        if os.environ.get('USE_CUDA', '') == 'yes':
            # CK TVM package must be built with CUDA enabled
            ctx = tvm.cuda(0)
        else:
            ctx = tvm.cpu(0)
        tvm_ctx = ctx

        build_conf = {'relay.backend.use_auto_scheduler': False}
        opt_lvl = int(os.environ.get('TVM_OPT_LEVEL', 3))

        target = os.environ.get('TVM_TARGET', 'llvm -mcpu=znver2')

        target_host = None
        params = {}

        # New target API
        tvm_target = tvm.target.Target(target, host=target_host)

        input_shape = (1, 3, 224, 224)
        shape_dict = {inputs[0]: input_shape}

        print ('')
        print ('TVM: import model ...')
        print ('')
        # Extra param: opset=12
        mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, freeze_params=True)

        print ('')
        print ('TVM: transform to static ...')
        print ('')
        mod = relay.transform.DynamicToStatic()(mod)

        print ('')
        print ('TVM: apply extra optimizations ...')
        print ('')
        # Padding optimization
        # Adds extra optimizations
        mod = relay.transform.FoldExplicitPadding()(mod)


        print ('')
        print ('TVM: build model ...')
        print ('')

        executor = os.environ.get('MLPERF_TVM_EXECUTOR', 'graph')

        if executor in ("graph", "debug"):
            from tvm.contrib import graph_executor

            # Without history
            with tvm.transform.PassContext(opt_level=opt_lvl, config=build_conf):
                graph_module = relay.build(mod,
                                           target=tvm_target,
                                           params=params)
            lib = graph_module

            print ('')
            print ('TVM: init graph engine ...')
            print ('')

            sess = graph_executor.GraphModule(lib['default'](ctx))


        elif executor == "vm":
            from tvm.runtime.vm import VirtualMachine

            # Without history
            with tvm.transform.PassContext(opt_level=opt_lvl, config=build_conf):
                vm_exec = relay.vm.compile(mod, target=tvm_target, params=params)

            r_exec = vm_exec

            print ('')
            print ('TVM: init VM ...')
            print ('')

            sess = VirtualMachine(r_exec, ctx)


        # For now only the graph executor path is handled below
        # (VirtualMachine.set_input takes a function name first, so the VM path would differ).
        sess.set_input(inputs[0], tvm.nd.array([img]))

        # Run TVM inference
        sess.run()

        # Process TVM outputs
        output = []

        for i in range(sess.get_num_outputs()):
            # Keep only the first batch element, to handle dynamic batch sizes
            if len(output) < (i + 1):
                output.append([])
            output[i].append(sess.get_output(i).asnumpy()[0])

    else:
        inp = {inputs[0]: np.array([img], dtype=np.float32)}
        output = sess.run(outputs, inp)

    top1 = np.argmax(output[1])

    top5 = []
    atop5 = get_top5(output[1][0])

    print('')
    print('Prediction Top1:', top1, synset[top1])

    print('')
    print('Prediction Top5:')
    for p in atop5:
        out = p[1]
        name = synset[out]
        print(' * {} {}'.format(out, name))

    ck_results = {
        'prediction': synset[top1]
    }

    with open('tmp-ck-timer.json', 'w') as ck_results_file:
        json.dump(ck_results, ck_results_file, indent=2, sort_keys=True)

    return
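A hedged invocation sketch for run_case; the environment variables mirror the ones the function reads, and the model path and image name are hypothetical placeholders:

import os

os.environ['USE_TVM'] = 'yes'
os.environ['TVM_TARGET'] = 'llvm -mcpu=znver2'
os.environ['CK_ENV_ONNX_MODEL_ONNX_FILEPATH'] = '/path/to/model.onnx'  # hypothetical

run_case('float32', 'street_small.jpg', 'llvm')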