Example No. 1
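    # Builds the TOPI adaptive pool (2D or 3D) with the Hexagon schedule, runs it over the
    # RPC session, and compares against the tvm.topi.testing.adaptive_pool reference.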
    def test_adaptive_pool(self, hexagon_session: Session, dshape, out_size,
                           pool_type, layout):
        dtype = "float32"
        np_data = np.random.uniform(low=0, high=255, size=dshape).astype(dtype)
        np_out = tvm.topi.testing.adaptive_pool(np_data, out_size, pool_type,
                                                layout)
        oshape = np_out.shape

        data = te.placeholder(dshape, name="data", dtype=dtype)
        if len(out_size) == 2:
            out = topi.nn.adaptive_pool(data, out_size, pool_type, layout)
        else:
            assert len(out_size) == 3
            out = topi.nn.adaptive_pool3d(data, out_size, pool_type, layout)

        target_hexagon = tvm.target.hexagon("v68")
        with tvm.target.Target(target_hexagon):
            fschedule = topi.hexagon.schedule_adaptive_pool
            s = fschedule(out)

        func = tvm.build(
            s,
            [data, out],
            tvm.target.Target(target_hexagon, host=target_hexagon),
            name="adaptive-pool",
        )
        mod = hexagon_session.load_module(func)

        dev = hexagon_session.device
        a = tvm.nd.array(np_data, dev)
        b = tvm.nd.array(np.zeros(get_const_tuple(oshape), dtype=out.dtype),
                         dev)
        mod["adaptive-pool"](a, b)

        tvm.testing.assert_allclose(b.numpy(), np_out, rtol=4e-5, atol=1e-6)
Example No. 2
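# Builds a minimal int8 element-wise add with TE, loads it through the Hexagon RPC session,
# and checks the device result against the expected values.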
def test_add(hexagon_session: Session):
    dtype = "int8"
    A = tvm.te.placeholder((2, ), dtype=dtype)
    B = tvm.te.placeholder((1, ), dtype=dtype)
    C = tvm.te.compute(A.shape, lambda i: A[i] + B[0], name="C")
    sched = tvm.te.create_schedule(C.op)

    target_hexagon = tvm.target.hexagon("v68", link_params=True)
    func = tvm.build(sched, [A, B, C],
                     tvm.target.Target(target_hexagon, host=target_hexagon),
                     name="add")

    mod = hexagon_session.load_module(func)

    A_data = tvm.nd.array(np.array([2, 3], dtype=dtype),
                          device=hexagon_session.device)
    assert (A_data.numpy() == np.array([2, 3])).all()
    B_data = tvm.nd.array(np.array([4], dtype=dtype),
                          device=hexagon_session.device)
    assert (B_data.numpy() == np.array([4])).all()
    C_data = tvm.nd.array(np.array([0, 0], dtype=dtype),
                          device=hexagon_session.device)
    assert (C_data.numpy() == np.array([0, 0])).all()
    mod["add"](A_data, B_data, C_data)
    assert (C_data.numpy() == np.array([6, 7])).all()
Example No. 3
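# Same element-wise add, but with all three tensors allocated in VTCM through the
# "global.vtcm" memory scope.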
def test_add_vtcm(hexagon_session: Session):
    dtype = "int8"
    A = tvm.te.placeholder((2, ), dtype=dtype)
    B = tvm.te.placeholder((1, ), dtype=dtype)
    C = tvm.te.compute(A.shape, lambda i: A[i] + B[0], name="C")
    sched = tvm.te.create_schedule(C.op)

    target_hexagon = tvm.target.hexagon("v68", link_params=True)
    func = tvm.build(sched, [A, B, C],
                     tvm.target.Target(target_hexagon, host=target_hexagon),
                     name="add")

    mod = hexagon_session.load_module(func)

    A_data = tvm.nd.empty(A.shape, A.dtype, hexagon_session.device,
                          "global.vtcm")
    A_data.copyfrom(np.array([2, 3]))

    B_data = tvm.nd.empty(B.shape, B.dtype, hexagon_session.device,
                          "global.vtcm")
    B_data.copyfrom(np.array([4]))

    C_data = tvm.nd.empty(C.shape, C.dtype, hexagon_session.device,
                          "global.vtcm")
    C_data.copyfrom(np.array([0, 0]))

    mod["add"](A_data, B_data, C_data)
    result = C_data.numpy()
    assert (result == np.array([6, 7])).all()
Example No. 4
def verify(hexagon_session: Session, schedule, x_tensor, y_tensor, z_tensor,
           size):
    """Verify correctness with reference from numpy"""
    print(tvm.lower(schedule, [x_tensor, y_tensor, z_tensor]))

    target_hexagon = tvm.target.hexagon("v68", link_params=True)
    func = tvm.build(
        schedule,
        [x_tensor, y_tensor, z_tensor],
        tvm.target.Target(target_hexagon, host=target_hexagon),
        name="dmacpy",
    )

    mod = hexagon_session.load_module(func)
    x_array = tvm.nd.array(
        np.random.randint(low=-128, high=127, size=size, dtype=x_tensor.dtype),
        device=hexagon_session.device,
    )
    y_array = tvm.nd.array(
        np.random.randint(low=-128, high=127, size=size, dtype=y_tensor.dtype),
        device=hexagon_session.device,
    )
    z_array = tvm.nd.array(
        np.random.randint(low=-128, high=127, size=size, dtype=z_tensor.dtype),
        device=hexagon_session.device,
    )
    mod["dmacpy"](x_array, y_array, z_array)

    ref = x_array.numpy() + y_array.numpy()
    np.testing.assert_equal(z_array.numpy(), ref)
Example No. 5
def test_add_vtcm(hexagon_session: Session):
    """Test add on VTCM"""
    dtype = "int8"
    placeholder_a = tvm.te.placeholder((2, ), dtype=dtype)
    placeholder_b = tvm.te.placeholder((1, ), dtype=dtype)
    compute_c = tvm.te.compute(placeholder_a.shape,
                               lambda i: placeholder_a[i] + placeholder_b[0],
                               name="C")
    sched = tvm.te.create_schedule(compute_c.op)

    target_hexagon = tvm.target.hexagon("v68", link_params=True)
    func = tvm.build(
        sched,
        [placeholder_a, placeholder_b, compute_c],
        tvm.target.Target(target_hexagon, host=target_hexagon),
        name="add",
    )

    mod = hexagon_session.load_module(func)

    a_data = tvm.nd.empty(placeholder_a.shape, placeholder_a.dtype,
                          hexagon_session.device, "global.vtcm")
    a_data.copyfrom(np.array([2, 3]))

    b_data = tvm.nd.empty(placeholder_b.shape, placeholder_b.dtype,
                          hexagon_session.device, "global.vtcm")
    b_data.copyfrom(np.array([4]))

    c_data = tvm.nd.empty(compute_c.shape, compute_c.dtype,
                          hexagon_session.device, "global.vtcm")
    c_data.copyfrom(np.array([0, 0]))

    mod["add"](a_data, b_data, c_data)
    result = c_data.numpy()
    assert (result == np.array([6, 7])).all()
Example No. 6
def test_add(hexagon_session: Session):
    """Test simple add"""
    dtype = "int8"
    placeholder_a = tvm.te.placeholder((2, ), dtype=dtype)
    placeholder_b = tvm.te.placeholder((1, ), dtype=dtype)
    compute_c = tvm.te.compute(placeholder_a.shape,
                               lambda i: placeholder_a[i] + placeholder_b[0],
                               name="C")
    sched = tvm.te.create_schedule(compute_c.op)

    target_hexagon = tvm.target.hexagon("v68", link_params=True)
    func = tvm.build(
        sched,
        [placeholder_a, placeholder_b, compute_c],
        tvm.target.Target(target_hexagon, host=target_hexagon),
        name="add",
    )

    mod = hexagon_session.load_module(func)

    a_data = tvm.nd.array(np.array([2, 3], dtype=dtype),
                          device=hexagon_session.device)
    assert (a_data.numpy() == np.array([2, 3])).all()
    b_data = tvm.nd.array(np.array([4], dtype=dtype),
                          device=hexagon_session.device)
    assert (b_data.numpy() == np.array([4])).all()
    c_data = tvm.nd.array(np.array([0, 0], dtype=dtype),
                          device=hexagon_session.device)
    assert (c_data.numpy() == np.array([0, 0])).all()
    mod["add"](a_data, b_data, c_data)
    assert (c_data.numpy() == np.array([6, 7])).all()
Example No. 7
def verify(hexagon_session: Session, s, x, y, z, size):
    print(tvm.lower(s, [x, y, z]))

    target_hexagon = tvm.target.hexagon("v68", link_params=True)
    func = tvm.build(s, [x, y, z],
                     tvm.target.Target(target_hexagon, host=target_hexagon),
                     name="dmacpy")

    mod = hexagon_session.load_module(func)
    xt = tvm.nd.array(
        np.random.randint(low=-128, high=127, size=size, dtype=x.dtype),
        device=hexagon_session.device,
    )
    yt = tvm.nd.array(
        np.random.randint(low=-128, high=127, size=size, dtype=y.dtype),
        device=hexagon_session.device,
    )
    zt = tvm.nd.array(
        np.random.randint(low=-128, high=127, size=size, dtype=z.dtype),
        device=hexagon_session.device,
    )
    mod["dmacpy"](xt, yt, zt)

    ref = xt.numpy() + yt.numpy()
    np.testing.assert_equal(zt.numpy(), ref)
Example No. 8
def test_graph_executor(hexagon_session: Session):
    """Test graph executor"""
    dtype = "float32"
    data = relay.var("data", relay.TensorType((1, 64, 64, 3), dtype))
    weight = relay.var("weight", relay.TensorType((5, 5, 3, 8), dtype))
    conv2d_op = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], conv2d_op)
    relay_mod = tvm.IRModule.from_expr(f)
    relay_mod = relay.transform.InferType()(relay_mod)

    target_hexagon = tvm.target.hexagon("v68")
    runtime = Runtime("cpp")
    executor = Executor("graph")

    weight_in = np.random.rand(5, 5, 3, 8).astype(dtype=dtype)
    data_in = np.random.rand(1, 64, 64, 3).astype(dtype=dtype)
    params = {"weight": weight_in}
    inputs = {"data": data_in}

    with tvm.transform.PassContext(opt_level=3):
        lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_hexagon, host=target_hexagon),
            runtime=runtime,
            executor=executor,
        )

    graph_mod = hexagon_session.get_executor_from_factory(lowered)
    graph_mod.set_input(**params)
    graph_mod.run(**inputs)
    hexagon_output = graph_mod.get_output(0).numpy()

    target_llvm = tvm.target.Target("llvm")
    with tvm.transform.PassContext(opt_level=3):
        llvm_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_llvm, host=target_llvm),
            runtime=runtime,
            executor=executor,
        )
    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(
        llvm_lowered["default"](tvm.cpu(0)))
    llvm_graph_mod.set_input(**params)
    llvm_graph_mod.run(**inputs)
    expected_output = llvm_graph_mod.get_output(0).numpy()

    tvm.testing.assert_allclose(hexagon_output,
                                expected_output,
                                rtol=1e-4,
                                atol=1e-5)
Example No. 9
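# Builds the predefined ElemwiseSumIRModule for Hexagon and verifies the parallel
# elementwise-sum kernel against the numpy sum of the inputs.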
def test_elemwise_sum_parallel(hexagon_session: Session):
    target_hexagon = tvm.target.hexagon("v68", link_params=True)
    func = tvm.build(
        ElemwiseSumIRModule, target=tvm.target.Target(target_hexagon, host=target_hexagon)
    )
    mod = hexagon_session.load_module(func)

    (a, b, c, n) = generate_add_test_data(hexagon_session)
    mod["elemwise_sum_parallel"](a, b, c, n)
    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
Example No. 10
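# Runs MobileNet (ONNX) through the AOT executor on Hexagon, optionally with USMP enabled,
# and compares the output against an LLVM graph-executor reference.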
def test_mobilenet_aot(hexagon_session: Session, aot_host_target, aot_target,
                       enable_usmp):
    if hexagon_session._launcher._serial_number == "simulator":
        pytest.skip(msg="Skip on simulator due to long runtime.")

    dtype = "float32"
    onnx_model = get_mobilenet()

    data_in = np.random.rand(1, 3, 224, 224).astype(dtype=dtype)

    input_name = "input"
    shape_dict = {input_name: data_in.shape}
    relay_mod, params = relay.frontend.from_onnx(onnx_model,
                                                 shape_dict,
                                                 freeze_params=True)
    inputs = {input_name: data_in}

    target_llvm = tvm.target.Target("llvm")
    config = {"tir.usmp.enable": enable_usmp}
    with tvm.transform.PassContext(opt_level=3, config=config):
        hexagon_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(aot_target, host=aot_host_target),
            runtime=Runtime("cpp"),
            executor=Executor("aot", {
                "unpacked-api": False,
                "interface-api": "packed"
            }),
            params=params,
        )

    aot_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
    aot_mod.set_input(**inputs)
    aot_mod.run()
    hexagon_output = aot_mod.get_output(0).numpy()

    with tvm.transform.PassContext(opt_level=3):
        llvm_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_llvm, host=target_llvm),
            runtime=Runtime("cpp"),
            executor=Executor("graph", {"link-params": True}),
            params=params,
        )

    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(
        llvm_lowered["default"](tvm.cpu(0)))
    llvm_graph_mod.set_input(**inputs)
    llvm_graph_mod.run()
    expected_output = llvm_graph_mod.get_output(0).numpy()

    tvm.testing.assert_allclose(hexagon_output,
                                expected_output,
                                rtol=1e-4,
                                atol=1e-5)
Example No. 11
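# Benchmarks the serial and parallel elementwise-sum kernels and prints the resulting speedup.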
def test_speedup(hexagon_session: Session, capsys):
    target_hexagon = tvm.target.hexagon("v68", link_params=True)
    func = tvm.build(
        ElemwiseSumIRModule, target=tvm.target.Target(target_hexagon, host=target_hexagon)
    )
    mod = hexagon_session.load_module(func)
    args = generate_add_test_data(hexagon_session)
    parallel_mean = benchmark_func(mod, "elemwise_sum_parallel", args, hexagon_session)
    serial_mean = benchmark_func(mod, "elemwise_sum_serial", args, hexagon_session)

    with capsys.disabled():
        print("... speedup of {:.2f}".format(serial_mean / parallel_mean), end=" ")
Example No. 12
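    # Builds topi.nn.conv2d_nhwc with the Hexagon schedule and checks the result against
    # the precomputed reference data.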
    def test_conv2d_nhwc(
        self,
        hexagon_session: Session,
        ref_data,
        batch,
        in_channel,
        in_size,
        num_filter,
        kernel,
        dtype,
        stride,
        padding,
        dilation,
    ):
        target_hexagon = tvm.target.hexagon("v68")

        a_np, w_np, b_np = ref_data

        A = te.placeholder(a_np.shape, name="A", dtype=dtype)
        W = te.placeholder(w_np.shape, name="W", dtype=dtype)

        with tvm.target.Target(target_hexagon):
            fcompute = topi.nn.conv2d_nhwc
            fschedule = topi.hexagon.schedule_conv2d_nhwc
            B = fcompute(A, W, stride, padding, dilation, dtype)
            s = fschedule([B])

        func_name = "conv2d_{}_{}_{}_{}_{}_{}_{}_{}_{}".format(
            dtype,
            batch,
            in_channel,
            in_size,
            num_filter,
            kernel,
            stride,
            padding,
            dilation,
        )
        func = tvm.build(s, [A, W, B],
                         tvm.target.Target(target_hexagon,
                                           host=target_hexagon),
                         name=func_name)
        mod = hexagon_session.load_module(func)

        dev = hexagon_session.device
        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype),
                         dev)

        mod[func_name](a, w, b)
        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
Example No. 13
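# Runs MobileNet (ONNX) through the graph executor on Hexagon and compares the output
# against an LLVM reference build.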
def test_mobilenet(hexagon_session: Session):
    dtype = "float32"
    onnx_model = get_mobilenet()

    target_hexagon = tvm.target.hexagon("v68")
    target_llvm = tvm.target.Target("llvm")
    runtime = Runtime("cpp")
    executor = Executor("graph", {"link-params": True})

    data_in = np.random.rand(1, 3, 224, 224).astype(dtype=dtype)

    input_name = "input"
    shape_dict = {input_name: data_in.shape}
    relay_mod, params = relay.frontend.from_onnx(onnx_model,
                                                 shape_dict,
                                                 freeze_params=True)
    inputs = {input_name: data_in}

    with tvm.transform.PassContext(opt_level=3):
        hexagon_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_hexagon, host=target_hexagon),
            runtime=runtime,
            executor=executor,
            params=params,
        )

        llvm_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_llvm, host=target_llvm),
            runtime=runtime,
            executor=executor,
            params=params,
        )

    graph_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
    graph_mod.set_input(**inputs)
    graph_mod.run()
    hexagon_output = graph_mod.get_output(0).numpy()

    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(
        llvm_lowered["default"](tvm.cpu(0)))
    llvm_graph_mod.set_input(**inputs)
    llvm_graph_mod.run()
    expected_output = llvm_graph_mod.get_output(0).numpy()

    tvm.testing.assert_allclose(hexagon_output,
                                expected_output,
                                rtol=1e-4,
                                atol=1e-5)
Example No. 14
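# Builds topi.nn.dense (with optional bias) followed by relu using the Hexagon schedule
# and checks against the reference data.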
def test_dense(
    hexagon_session: Session,
    batch_size,
    in_dim,
    out_dim,
    use_bias,
    in_dtype,
    out_dtype,
    dense_ref_data,
):
    if in_dtype == "float16":
        pytest.xfail("float16 is not supported.")

    if "int" in in_dtype:
        tol = {"atol": 0, "rtol": 0}
    elif in_dtype == "float32":
        tol = {"rtol": 1e-5, "atol": 1e-5}

    A = te.placeholder((batch_size, in_dim), name="A", dtype=in_dtype)
    B = te.placeholder((out_dim, in_dim), name="B", dtype=in_dtype)
    C = te.placeholder((out_dim, ), name="C", dtype=out_dtype)

    a_np, b_np, c_np, d_np = dense_ref_data

    fcompute = topi.nn.dense
    fschedule = topi.hexagon.schedule_dense

    target_hexagon = tvm.target.hexagon("v68")
    with tvm.target.Target(target_hexagon):
        D = fcompute(A, B, C if use_bias else None, out_dtype)
        D = topi.nn.relu(D)
        s = fschedule([D])

    func = tvm.build(s, [A, B, C, D],
                     tvm.target.Target(target_hexagon, host=target_hexagon),
                     name="dense")
    mod = hexagon_session.load_module(func)

    dev = hexagon_session.device
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(b_np, dev)
    c = tvm.nd.array(c_np, dev)
    d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), dev)
    mod["dense"](a, b, c, d)
    tvm.testing.assert_allclose(d.numpy(), d_np, **tol)
Example No. 15
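    # int8 batch matmul: random operands, numpy reference via tvm.topi.testing.batch_matmul,
    # result checked on the Hexagon device.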
    def test_batch_matmul_int8(self, hexagon_session: Session, x_batch,
                               y_batch, M, N, K):
        dtype = "int8"
        out_dtype = "int8"
        assert x_batch == y_batch or x_batch == 1 or y_batch == 1
        x = te.placeholder((x_batch, M, K), name="x", dtype=dtype)
        y = te.placeholder((y_batch, N, K), name="y", dtype=dtype)

        def get_ref_data():
            a_np = np.random.randint(low=-128, high=127,
                                     size=(x_batch, M, K)).astype(dtype)
            b_np = np.random.randint(low=-128, high=127,
                                     size=(y_batch, N, K)).astype(dtype)
            c_np = tvm.topi.testing.batch_matmul(a_np,
                                                 b_np,
                                                 out_dtype=out_dtype)
            return (a_np, b_np, c_np)

        # get the test data
        a_np, b_np, c_np = get_ref_data()

        target_hexagon = tvm.target.hexagon("v68")
        with tvm.target.Target(target_hexagon):
            fcompute = topi.nn.batch_matmul
            fschedule = topi.hexagon.schedule_batch_matmul
            out = fcompute(x, y)
            s = fschedule([out])

        func = tvm.build(
            s,
            [x, y, out],
            tvm.target.Target(target_hexagon, host=target_hexagon),
            name="batch_matmul_int8",
        )
        mod = hexagon_session.load_module(func)

        dev = hexagon_session.device
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out_dtype),
                         dev)
        mod["batch_matmul_int8"](a, b, c)
        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
Example No. 16
    def test_batch_matmul(self, hexagon_session: Session, x_batch, y_batch, M,
                          N, K, dtype):
        if dtype == "float16":
            pytest.xfail("float16 is not supported.")

        x = te.placeholder((x_batch, M, K), name="x")
        y = te.placeholder((y_batch, N, K), name="y")

        def get_ref_data():
            a_np = np.random.uniform(size=(x_batch, M, K)).astype(dtype)
            b_np = np.random.uniform(size=(y_batch, N, K)).astype(dtype)
            c_np = tvm.topi.testing.batch_matmul(a_np, b_np)
            return (a_np, b_np, c_np)

        # get the test data
        a_np, b_np, c_np = get_ref_data()

        target_hexagon = tvm.target.hexagon("v68")
        with tvm.target.Target(target_hexagon):
            fcompute = topi.nn.batch_matmul
            fschedule = topi.hexagon.schedule_batch_matmul
            out = fcompute(x, y)
            s = fschedule([out])
            out_shape = out.shape

        func = tvm.build(
            s,
            [x, y, out],
            tvm.target.Target(target_hexagon, host=target_hexagon),
            name="batch_matmul",
        )
        mod = hexagon_session.load_module(func)

        dev = hexagon_session.device
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(out_shape), dtype=dtype),
                         dev)
        mod["batch_matmul"](a, b, c)

        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
Example No. 17
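# Builds the softmax operator selected via `configs` with the Hexagon schedule and compares
# against tvm.topi.testing.softmax_python.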
def test_softmax(hexagon_session: Session, shape, dtype, softmax_operation):
    if dtype == "float16":
        pytest.xfail("float16 is not supported.")
    A = te.placeholder(shape, dtype=dtype, name="A")

    topi_op = configs[softmax_operation]["topi"]
    B = topi_op(A, axis=1)

    def get_ref_data(shape):
        ref_func = tvm.topi.testing.softmax_python
        a_np = np.random.uniform(size=shape).astype(dtype)

        if len(shape) == 2:
            b_np = ref_func(a_np)
        elif len(shape) == 4:
            _, c, h, w = a_np.shape
            a_np_2d = a_np.transpose(0, 2, 3, 1).reshape(h * w, c)
            b_np_2d = tvm.topi.testing.softmax_python(a_np_2d)
            b_np = b_np_2d.reshape(1, h, w, c).transpose(0, 3, 1, 2)

        return a_np, b_np

    # get the test data
    a_np, b_np = get_ref_data(shape)

    target_hexagon = tvm.target.hexagon("v68")
    with tvm.target.Target(target_hexagon):
        fschedule = topi.hexagon.schedule_softmax
        s = fschedule(B)

    func = tvm.build(s, [A, B],
                     tvm.target.Target(target_hexagon, host=target_hexagon),
                     name="softmax")
    mod = hexagon_session.load_module(func)

    dev = hexagon_session.device
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
    mod["softmax"](a, b)

    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
Example No. 18
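    # conv2d NCHW with optional bias add and relu; the comparison tolerance is chosen
    # per dtype below.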
    def test_conv2d_nchw(
        self,
        hexagon_session: Session,
        batch,
        in_channel,
        in_size,
        num_filter,
        kernel,
        stride,
        padding,
        dtype,
        ref_data,
        dilation,
        add_bias,
        apply_relu,
    ):
        target_hexagon = tvm.target.hexagon("v68")

        pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
            padding, (kernel, kernel))
        padding_sum = pad_top + pad_left + pad_bottom + pad_right

        a_np, w_np, b_np, c_np = ref_data

        A = te.placeholder(a_np.shape, name="A", dtype=dtype)
        W = te.placeholder(w_np.shape, name="W", dtype=dtype)
        bias = te.placeholder(b_np.shape, name="bias", dtype=dtype)

        if "int" in dtype:
            tol = {"atol": 0, "rtol": 0}
        elif dtype == "float32":
            tol = {"rtol": 1e-4, "atol": 2e-4}
        elif dtype == "float16":
            # A summation in float16 with a single accumulator very
            # quickly runs into large rounding errors.  At some point,
            # this tolerance should be schedule-dependent to avoid
            # false negatives.
            num_values_summed = in_channel * kernel * kernel
            gap_size = np.nextafter(c_np.max(), np.inf,
                                    dtype=c_np.dtype) - c_np.max()
            tol = {"rtol": 1e-3, "atol": num_values_summed * gap_size / 2}

        with tvm.target.Target(target_hexagon):
            fcompute = topi.nn.conv2d_nchw
            fschedule = topi.hexagon.schedule_conv2d_nchw
            C = fcompute(A, W, (stride, stride), padding, (dilation, dilation),
                         dtype)
            if add_bias:
                C = topi.add(C, bias)
            if apply_relu:
                C = topi.nn.relu(C)
            s = fschedule([C])

        func_name = "conv2d_{}_{}_{}_{}_{}_{}_{}_{}_{}".format(
            dtype,
            batch,
            in_channel,
            in_size,
            num_filter,
            kernel,
            stride,
            padding_sum,
            dilation,
        )
        func = tvm.build(
            s,
            [A, W, bias, C],
            tvm.target.Target(target_hexagon, host=target_hexagon),
            name=func_name,
        )
        mod = hexagon_session.load_module(func)

        dev = hexagon_session.device
        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(b_np, dev)

        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        mod[func_name](a, w, b, c)
        tvm.testing.assert_allclose(c.numpy(), c_np, **tol)
Example No. 19
def test_conv2d(hexagon_session: Session, aot_host_target, aot_target,
                usmp_enabled):
    """Try conv2d on AOT target with usmp_enabled and check for TVMBackendAllocWorkspace calls"""
    dtype = "float32"
    input_shape = (1, 8, 8, 3)
    w1_shape = (5, 5, 3, 1)
    w2_shape = (5, 5, 1, 3)
    data = relay.var("data", relay.TensorType(input_shape, dtype))
    weight1 = relay.var("weight1", relay.TensorType(w1_shape, dtype))
    weight2 = relay.var("weight2", relay.TensorType(w2_shape, dtype))
    output1 = relay.nn.conv2d(
        data,
        weight1,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    output2 = relay.nn.conv2d(
        output1,
        weight2,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    f = relay.Function([data, weight1, weight2], output2)
    relay_mod = tvm.IRModule.from_expr(f)
    relay_mod = relay.transform.InferType()(relay_mod)

    weight1_data = np.random.rand(w1_shape[0], w1_shape[1], w1_shape[2],
                                  w1_shape[3]).astype(dtype=dtype)
    weight2_data = np.random.rand(w2_shape[0], w2_shape[1], w2_shape[2],
                                  w2_shape[3]).astype(dtype=dtype)
    input_data = np.random.rand(input_shape[0], input_shape[1], input_shape[2],
                                input_shape[3]).astype(dtype=dtype)

    params = {"weight1": weight1_data, "weight2": weight2_data}
    inputs = {"data": input_data}

    with tvm.transform.PassContext(opt_level=3,
                                   config={"tir.usmp.enable": usmp_enabled}):
        lowered = tvm.relay.build(
            relay_mod,
            params=params,
            target=tvm.target.Target(aot_target, host=aot_host_target),
            runtime=Runtime("cpp"),
            executor=Executor("aot", {
                "unpacked-api": False,
                "interface-api": "packed"
            }),
        )

    assert is_tvm_backendallocworkspace_calls(lowered.lib) != usmp_enabled

    aot_mod = hexagon_session.get_executor_from_factory(lowered)
    aot_mod.set_input(**inputs)
    aot_mod.run()
    hexagon_output = aot_mod.get_output(0).numpy()

    target_llvm = tvm.target.Target("llvm")
    with tvm.transform.PassContext(opt_level=3):
        llvm_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_llvm, host=target_llvm),
            runtime=Runtime("cpp"),
            executor=Executor("graph"),
        )

    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(
        llvm_lowered["default"](tvm.cpu(0)))
    llvm_graph_mod.set_input(**params)
    llvm_graph_mod.run(**inputs)
    expected_output = llvm_graph_mod.get_output(0).numpy()

    tvm.testing.assert_allclose(hexagon_output,
                                expected_output,
                                rtol=1e-4,
                                atol=1e-5)
Example No. 20
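    # conv2d_transpose (NCHW), built both with and without a trailing relu, and checked
    # against the numpy reference.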
    def test_conv2d(
        self,
        hexagon_session: Session,
        batch,
        in_channel,
        in_size,
        num_filter,
        stride,
        padding,
        output_padding,
        random_seed,
    ):

        target_hexagon = tvm.target.hexagon("v68")

        in_height, in_width = in_size
        kernel_height, kernel_width = (1, 1)
        stride_height, stride_width = stride
        pad_top, pad_left, pad_bottom, pad_right = padding

        A = te.placeholder((batch, in_channel, in_height, in_width), name="A")
        W = te.placeholder(
            (in_channel, num_filter, kernel_height, kernel_width), name="W")

        a_shape = get_const_tuple(A.shape)
        w_shape = get_const_tuple(W.shape)
        dtype = A.dtype

        def get_ref_data():

            np.random.seed(random_seed)
            a_np = np.random.uniform(size=a_shape).astype(dtype)
            w_np = np.random.uniform(size=w_shape).astype(dtype)
            b_np = tvm.topi.testing.conv2d_transpose_nchw_python(
                a_np, w_np, stride, padding, output_padding)
            c_np = np.maximum(b_np, 0)
            return a_np, w_np, b_np, c_np

        a_np, w_np, b_np, c_np = get_ref_data()

        fcompute_args = (
            A,
            W,
            [stride_height, stride_width],
            [pad_top, pad_left, pad_bottom, pad_right],
            A.dtype,
            output_padding,
        )

        with tvm.target.Target(target_hexagon):
            fcompute = topi.nn.conv2d_transpose_nchw
            fschedule = topi.hexagon.schedule_conv2d_transpose_nchw
            B = fcompute(*fcompute_args)
            C = topi.nn.relu(B)
            s1 = fschedule([B])
            s2 = fschedule([C])

            dev = hexagon_session.device

            a = tvm.nd.array(a_np, dev)
            w = tvm.nd.array(w_np, dev)
            b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype),
                             dev)
            c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                             dev)

            func1 = tvm.build(
                s1, [A, W, B],
                tvm.target.Target(target_hexagon, host=target_hexagon))
            func2 = tvm.build(
                s2, [A, W, C],
                tvm.target.Target(target_hexagon, host=target_hexagon))

            mod1 = hexagon_session.load_module(func1)
            mod2 = hexagon_session.load_module(func2)

            mod1(a, w, b)
            mod2(a, w, c)
            tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
            tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
Example No. 21
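# Two chained NHWC conv2d ops run through the graph executor on Hexagon and compared
# against an LLVM reference.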
def test_graph_executor_multiple_conv2d(hexagon_session: Session):
    dtype = "float32"
    input_shape = (1, 8, 8, 3)
    w1_shape = (5, 5, 3, 1)
    w2_shape = (5, 5, 1, 3)
    data = relay.var("data", relay.TensorType(input_shape, dtype))
    weight1 = relay.var("weight1", relay.TensorType(w1_shape, dtype))
    weight2 = relay.var("weight2", relay.TensorType(w2_shape, dtype))
    y1 = relay.nn.conv2d(
        data,
        weight1,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    y2 = relay.nn.conv2d(
        y1,
        weight2,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    f = relay.Function([data, weight1, weight2], y2)
    relay_mod = tvm.IRModule.from_expr(f)
    relay_mod = relay.transform.InferType()(relay_mod)

    target_hexagon = tvm.target.hexagon("v68")
    runtime = Runtime("cpp")
    executor = Executor("graph")

    with tvm.transform.PassContext(opt_level=3):
        lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_hexagon, host=target_hexagon),
            runtime=runtime,
            executor=executor,
        )

    weight1_data = np.random.rand(w1_shape[0], w1_shape[1], w1_shape[2],
                                  w1_shape[3]).astype(dtype=dtype)
    weight2_data = np.random.rand(w2_shape[0], w2_shape[1], w2_shape[2],
                                  w2_shape[3]).astype(dtype=dtype)
    input_data = np.random.rand(input_shape[0], input_shape[1], input_shape[2],
                                input_shape[3]).astype(dtype=dtype)

    params = {"weight1": weight1_data, "weight2": weight2_data}
    inputs = {"data": input_data}

    graph_mod = hexagon_session.get_executor_from_factory(lowered)
    graph_mod.set_input(**params)
    graph_mod.run(**inputs)
    hexagon_output = graph_mod.get_output(0).numpy()

    target_llvm = tvm.target.Target("llvm")
    with tvm.transform.PassContext(opt_level=3):
        llvm_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_llvm, host=target_llvm),
            runtime=runtime,
            executor=executor,
        )
    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(
        llvm_lowered["default"](tvm.cpu(0)))
    llvm_graph_mod.set_input(**params)
    llvm_graph_mod.run(**inputs)
    expected_output = llvm_graph_mod.get_output(0).numpy()

    tvm.testing.assert_allclose(hexagon_output,
                                expected_output,
                                rtol=1e-4,
                                atol=1e-5)
Example No. 22
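# Builds one of several reduction operators (sum/all/any/max/min/argmax/argmin) with the
# Hexagon reduce schedule and checks against the numpy reference.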
def test_reduce_map(hexagon_session: Session, ref_data, in_shape, axis,
                    keepdims, reduce_type, dtype):
    in_npy, in_npy_map, out_npy = ref_data

    # Build the logic and compile the function
    A = te.placeholder(shape=in_shape, name="A", dtype=dtype)
    A1 = topi.sqrt(topi.exp(A))
    out_dtype = dtype
    if reduce_type == "sum":
        B = topi.sum(A1, axis=axis, keepdims=keepdims)
    elif reduce_type == "all":
        B = topi.all(A, axis=axis, keepdims=keepdims)
    elif reduce_type == "any":
        B = topi.any(A, axis=axis, keepdims=keepdims)
    elif reduce_type == "max":
        B = topi.max(A1, axis=axis, keepdims=keepdims)
    elif reduce_type == "min":
        B = topi.min(A1, axis=axis, keepdims=keepdims)
    elif reduce_type == "argmax":
        B = topi.argmax(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    elif reduce_type == "argmin":
        B = topi.argmin(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    else:
        raise NotImplementedError

    target_hexagon = tvm.target.hexagon("v68")
    with tvm.target.Target(target_hexagon):
        fschedule = topi.hexagon.schedule_reduce
        s = fschedule(B)

    func = tvm.build(s, [A, B],
                     tvm.target.Target(target_hexagon, host=target_hexagon),
                     name=reduce_type)
    mod = hexagon_session.load_module(func)

    dev = hexagon_session.device
    data_tvm = tvm.nd.array(in_npy, device=dev)
    out_tvm = tvm.nd.empty(shape=out_npy.shape, device=dev, dtype=out_dtype)

    mod[reduce_type](data_tvm, out_tvm)

    if reduce_type == "argmax" or reduce_type == "argmin":
        out_tvm_indices = out_tvm.numpy()
        if keepdims:
            out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis)
        if axis is None:
            out_tvm_val = in_npy_map.ravel()[out_tvm_indices]
        else:
            other_indices = tuple(
                np.indices(in_shape[0:axis] + in_shape[(axis + 1):]))
            sel_indices = other_indices[0:axis] + (
                out_tvm_indices, ) + other_indices[axis:]
            out_tvm_val = in_npy_map[sel_indices]
        if reduce_type == "argmax":
            tvm.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis),
                                        1e-3, 1e-3)
        elif reduce_type == "argmin":
            tvm.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis),
                                        1e-3, 1e-3)
    else:
        tvm.testing.assert_allclose(out_tvm.numpy(), out_npy, 1e-3, 1e-3)
Example No. 23
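# Builds a single conv2d with the AOT executor for Hexagon and compares the output against
# an LLVM graph-executor reference.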
def test_aot_executor(hexagon_session: Session, aot_host_target, aot_target):
    dtype = "float32"
    input_shape = (1, 128, 128, 3)
    w_shape = (5, 5, 3, 8)
    data = relay.var("data", relay.TensorType(input_shape, dtype))
    weight = relay.var("weight", relay.TensorType(w_shape, dtype))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    relay_mod = tvm.IRModule.from_expr(f)
    relay_mod = relay.transform.InferType()(relay_mod)

    weight_data = np.random.rand(w_shape[0], w_shape[1], w_shape[2],
                                 w_shape[3]).astype(dtype=dtype)
    input_data = np.random.rand(input_shape[0], input_shape[1], input_shape[2],
                                input_shape[3]).astype(dtype=dtype)

    params = {"weight": weight_data}
    inputs = {"data": input_data}

    with tvm.transform.PassContext(opt_level=3):
        lowered = tvm.relay.build(
            relay_mod,
            params=params,
            target=tvm.target.Target(aot_target, host=aot_host_target),
            runtime=Runtime("cpp"),
            executor=Executor("aot", {
                "unpacked-api": False,
                "interface-api": "packed"
            }),
        )

    aot_mod = hexagon_session.get_executor_from_factory(lowered)
    aot_mod.set_input(**inputs)
    aot_mod.run()
    hexagon_output = aot_mod.get_output(0).numpy()

    target_llvm = tvm.target.Target("llvm")
    with tvm.transform.PassContext(opt_level=3):
        llvm_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_llvm, host=target_llvm),
            runtime=Runtime("cpp"),
            executor=Executor("graph"),
        )

    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(
        llvm_lowered["default"](tvm.cpu(0)))
    llvm_graph_mod.set_input(**params)
    llvm_graph_mod.run(**inputs)
    expected_output = llvm_graph_mod.get_output(0).numpy()

    tvm.testing.assert_allclose(hexagon_output,
                                expected_output,
                                rtol=1e-4,
                                atol=1e-5)
Example No. 24
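    # Benchmarks max_pool2d on the int8 nhwc-8h8w32c blocked layout, saving the built
    # shared object and recording timings in the benchmark table.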
    def test_maxpool2d_nhwc(
        self,
        N,
        H,
        W,
        C,
        DTYPE,
        KERNEL,
        STRIDE,
        DILATION,
        PADDING,
        IO_TENSOR_MEM_SCOPE,
        hexagon_session: Session,
    ):
        keys_dict = {
            "basic_kernel": "max_pool2d",
            "sched_type": 1,
            "input_shape_4d": [N, H, W, C],
            "block_shape": [8, 8, 32],
            "DTYPE": DTYPE,
            "KERNEL": KERNEL,
            "STRIDE": STRIDE,
            "DILATION": DILATION,
            "PADDING": PADDING,
            "IO_TENSOR_MEM_SCOPE": IO_TENSOR_MEM_SCOPE,
        }

        desc = bu.get_benchmark_decription(keys_dict)

        # Create the host-side directory for this benchmark run's files / logs...
        host_files_dir_name = bu.get_benchmark_id(keys_dict)
        host_files_dir_path = os.path.join(self.working_dir, host_files_dir_name)
        os.mkdir(host_files_dir_path)

        keys_dict["host_files_dir_path"] = host_files_dir_path

        log_file_path = os.path.join(host_files_dir_path, "out.txt")
        with open(log_file_path, "w") as log_file:
            print(f"CONFIGURATION: {desc}")
            log_file.write(f"CONFIGURATION: {desc}\n")

            try:
                input_tensor_shape_4d = [N, H, W, C]
                input_tensor_shape_7d = _int8_nhwc_8h8w32c_shape(N, H, W, C)

                data = te.placeholder(tuple(input_tensor_shape_4d), dtype=DTYPE)

                output = topi.nn.pool2d(
                    data, KERNEL, STRIDE, DILATION, PADDING, "max", layout="NHWC"
                )
                primfunc = te.create_prim_func([data, output])

                sch = tir.Schedule(primfunc, debug_mask="all")

                sch.transform_layout(
                    block="tensor", buffer="placeholder", index_map=_int8_nhwc_8h8w32c_map
                )

                target_hexagon = tvm.target.hexagon("v69", link_params=True)
                # func = tvm.build(sch.mod, target=tvm.target.Target(target_hexagon, host=target_hexagon))
                built_module = tvm.build(
                    sch.mod, target=tvm.target.Target(target_hexagon, host=target_hexagon)
                )

                # Save a local copy of the Hexagon object code (in the form of a .so file)
                # to allow post-mortem inspection.
                host_dso_binary_path = os.path.join(host_files_dir_path, "test_binary.so")
                built_module.save(host_dso_binary_path)
                print(f"SAVED BINARY TO HOST PATH: {host_dso_binary_path}")

                hexagon_mod = hexagon_session.load_module(built_module)

                # Generate the input tensor's data.
                # Note that we'll eventually need it in two different layouts:
                # (1) NHWC as an argument to testing.poolnd_python.
                # (2) NHWC_8h8w32c as an argument to our Hexagon primfunc.
                # a_numpy_4d = np.random.randint(low=-128, high=127, size=input_tensor_shape_4d, dtype=np.int8)
                a_numpy_4d = _create_test_input(input_tensor_shape_4d, DTYPE)

                ref_output_4d = testing.poolnd_python(
                    a_numpy_4d.astype("int32"),
                    KERNEL,
                    STRIDE,
                    DILATION,
                    PADDING[0:2],
                    PADDING[2:],
                    pool_type="max",
                    dtype="int32",
                    layout="NHWC",
                ).astype(DTYPE)

                output_tensor_shape_4d = ref_output_4d.shape

                a_numpy_7d = _int8_nhwc_8h8w32c_xform_immediate(a_numpy_4d)

                a_hexagon_7d = allocate_hexagon_array(
                    hexagon_session.device,
                    tensor_shape=input_tensor_shape_7d,
                    axis_separators=[4],
                    dtype=DTYPE,
                    mem_scope=IO_TENSOR_MEM_SCOPE,
                )

                c_hexagon_4d = allocate_hexagon_array(
                    hexagon_session.device,
                    tensor_shape=output_tensor_shape_4d,
                    axis_separators=[],
                    dtype=DTYPE,
                    mem_scope=IO_TENSOR_MEM_SCOPE,
                )

                a_hexagon_7d.copyfrom(a_numpy_7d)

                if DTYPE == "int8":
                    rel_tolerance = 0
                    abs_tolerance = 0
                else:
                    assert False, f"TODO: decide acceptable tolerances for DTYPE {DTYPE}"

                # hexagon_mod(a_hexagon_7d, c_hexagon_4d)
                # tvm.testing.assert_allclose(ref_output_4d, c_hexagon_4d.numpy(), rtol=rel_tolerance, atol=abs_tolerance)

                timer = hexagon_mod.time_evaluator(
                    "main", hexagon_session.device, number=10, repeat=1
                )
                timing_result = timer(a_hexagon_7d, c_hexagon_4d)

                try:
                    tvm.testing.assert_allclose(
                        ref_output_4d, c_hexagon_4d.numpy(), rtol=rel_tolerance, atol=abs_tolerance
                    )
                except AssertionError as e:
                    raise bu.NumericalAccuracyException(str(e))

            except bu.NumericalAccuracyException as e:
                print()
                print(f"FAIL: Numerical accuracy error. See log file.")

                log_file.write("\n")
                log_file.write(f"FAIL: {e}\n")

                self.benchmark_table.record_fail(
                    **keys_dict, comments=f"Numerical accuracy error. See log file."
                )

            except bu.UnsupportedException as e:
                print()
                print(f"SKIP: {e}")

                log_file.write("\n")
                log_file.write(f"SKIP: {e}\n")

                self.benchmark_table.record_skip(
                    **keys_dict, comments=f"Unsupported configuration: {e}"
                )

            else:
                self.benchmark_table.record_success(timing_result, **keys_dict)
Example No. 25
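    # Runs the avg_pool2d slice op through its STIR schedule on VTCM-resident arrays and
    # checks the layout-transformed output.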
    def test_avg_pool2d_slice(
        self,
        stride,
        kernel,
        dtype,
        dilation,
        padding,
        count_include_pad,
        input_layout,
        output_layout,
        output_shape,
        input_shape,
        input_shape_padded,
        input_np,
        input_np_padded,
        transformed_input_np_padded,
        transformed_expected_output_np,
        expected_output_np,
        hexagon_session: Session,
    ):
        if hexagon_session._launcher._serial_number != "simulator":
            pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11928")

        target_hexagon = tvm.target.hexagon("v69")
        A = te.placeholder(input_shape_padded, name="A", dtype=dtype)

        M = sl.avg_pool2d_compute(A, output_shape, kernel, stride, dilation)

        # tir schedule
        tir_schedule = sl.avg_pool2d_STIR_schedule(M, A, output_layout, input_layout)
        sch = tir_schedule.mod

        input_axis_separator = [4]
        if output_layout == "nhwc-8h2w32c2w-2d":
            output_axis_separator = [4]
        elif output_layout == "n11c-1024c-2d":
            output_axis_separator = [4]
        else:
            raise RuntimeError(f"Unexpected layout '{output_layout}'")

        with tvm.transform.PassContext(opt_level=3):
            func = tvm.build(
                sch,
                [A, M],
                tvm.target.Target(target_hexagon, host=target_hexagon),
                name="avg_pool2d",
            )

        input_arr = allocate_hexagon_array(
            hexagon_session.device,
            data=transformed_input_np_padded,
            axis_separators=input_axis_separator,
            mem_scope="global.vtcm",
        )
        output_arr = allocate_hexagon_array(
            hexagon_session.device,
            transformed_expected_output_np.shape,
            dtype,
            axis_separators=output_axis_separator,
            mem_scope="global.vtcm",
        )

        mod = hexagon_session.load_module(func)
        mod(input_arr, output_arr)
        b, h, w, c = output_shape
        if output_layout == "nhwc-8h2w32c2w-2d":
            output_np = output_arr.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2])
        elif output_layout == "n11c-1024c-2d":
            output_np = output_arr.numpy().reshape([b, 1, 1, c // 1024, 1024])
        else:
            raise RuntimeError(f"Unexpected layout '{output_layout}'")

        np.testing.assert_allclose(output_np, transformed_expected_output_np, rtol=1e-3, atol=1e-3)
Example No. 26
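    # Depthwise conv2d (NCHW or NHWC) with optional scale/shift and relu, checked against
    # the reference data.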
    def test_conv2d(
        self,
        hexagon_session: Session,
        in_dtype,
        out_dtype,
        layout,
        input_shape,
        filter_shape,
        scale_shape,
        shift_shape,
        use_scale_shift,
        apply_relu,
        batch,
        in_channel,
        channel_multiplier,
        kernel,
        stride,
        padding,
        dilation,
        ref_data,
    ):
        target_hexagon = tvm.target.hexagon("v68")

        # Transform the padding argument from 'str' to 'tuple' to
        # match the "workload" tuple in TopHub.  Which padding_args to
        # use for each layout chosen to reproduce previous behavior.
        if dilation == 1:
            padding_args = get_pad_tuple(padding, (kernel, kernel))
            padding_args_i = [0, 1, 2, 3] if layout == "NCHW" else [0, 1]
            padding_args = [padding_args[i] for i in padding_args_i]
        else:
            padding_args = padding

        # placeholder
        Input = te.placeholder(input_shape, name="Input", dtype=in_dtype)
        Filter = te.placeholder(filter_shape, name="Filter", dtype=in_dtype)
        Scale = te.placeholder(scale_shape, name="Scale", dtype=out_dtype)
        Shift = te.placeholder(shift_shape, name="Shift", dtype=out_dtype)

        if layout == "NCHW":
            topi_scale_shift = topi.nn.scale_shift_nchw
            fcompute_args = (Input, Filter, stride, padding_args, dilation,
                             out_dtype)

        elif layout == "NHWC":
            topi_scale_shift = topi.nn.scale_shift_nhwc
            fcompute_args = (Input, Filter, stride, padding_args, dilation,
                             out_dtype)

        elif layout == "NCHWc":
            topi_scale_shift = topi.nn.scale_shift_nchwc
            in_layout = "NCHW{}c".format(input_shape[-1])
            out_layout = "NCHW{}c".format(filter_shape[-1])
            fcompute_args = (
                Input,
                Filter,
                stride,
                padding,
                dilation,
                in_layout,
                out_layout,
                out_dtype,
            )

        with tvm.target.Target(target_hexagon):
            # Declare, build schedule
            if layout == "NCHW":
                fcompute = topi.nn.depthwise_conv2d_nchw
                fschedule = topi.hexagon.schedule_depthwise_conv2d_nchw
            elif layout == "NHWC":
                fcompute = topi.nn.depthwise_conv2d_nhwc
                fschedule = topi.hexagon.schedule_depthwise_conv2d_nhwc
            C = fcompute(*fcompute_args)
            if use_scale_shift:
                C = topi_scale_shift(C, Scale, Shift)
            if apply_relu:
                C = topi.nn.relu(C)

            s = fschedule([C])

            # Build and run
            f = tvm.build(
                s,
                [Input, Filter, Scale, Shift, C],
                tvm.target.Target(target_hexagon, host=target_hexagon),
            )
            mod = hexagon_session.load_module(f)

            input_np, filter_np, scale_np, shift_np, output_np = ref_data

            dev = hexagon_session.device
            input_tvm = tvm.nd.array(input_np, dev)
            filter_tvm = tvm.nd.array(filter_np, dev)
            scale_tvm = tvm.nd.array(scale_np, dev)
            shift_tvm = tvm.nd.array(shift_np, dev)
            output_tvm = tvm.nd.array(
                np.zeros(shape=get_const_tuple(C.shape), dtype=C.dtype),
                dev,
            )

            mod(input_tvm, filter_tvm, scale_tvm, shift_tvm, output_tvm)

            tol = {"rtol": 1e-4, "atol": 1e-5}
            tvm.testing.assert_allclose(output_np, output_tvm.numpy(), **tol)