Example 1
    def test_batch_matmul(self, hexagon_session: Session, x_batch, y_batch, M, N, K, dtype):
        if dtype == "float16":
            pytest.xfail("float16 is not supported.")

        x = te.placeholder((x_batch, M, K), name="x")
        y = te.placeholder((y_batch, N, K), name="y")

        def get_ref_data():
            a_np = np.random.uniform(size=(x_batch, M, K)).astype(dtype)
            b_np = np.random.uniform(size=(y_batch, N, K)).astype(dtype)
            c_np = tvm.topi.testing.batch_matmul(a_np, b_np)
            return (a_np, b_np, c_np)

        # get the test data
        a_np, b_np, c_np = get_ref_data()

        target_hexagon = tvm.target.hexagon("v68")
        with tvm.target.Target(target_hexagon):
            fcompute = topi.nn.batch_matmul
            fschedule = topi.hexagon.schedule_batch_matmul
            out = fcompute(x, y)
            s = fschedule([out])
            out_shape = out.shape

        func = tvm.build(
            s,
            [x, y, out],
            tvm.target.Target(target_hexagon, host=target_hexagon),
            name="batch_matmul",
        )
        mod = hexagon_session.load_module(func)

        dev = hexagon_session.device
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(out_shape), dtype=dtype), dev)
        mod["batch_matmul"](a, b, c)

        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
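In the layout used above, the second operand y is stored as (batch, N, K), so the reference result is effectively A @ B^T with batch broadcasting. A minimal numpy sketch of such a reference (for illustration only; batch_matmul_ref is a hypothetical helper, not tvm.topi.testing.batch_matmul):

import numpy as np

def batch_matmul_ref(a, b):
    # a: (x_batch, M, K), b: (y_batch, N, K); a batch of 1 broadcasts
    # against the other operand. Result shape: (batch, M, N).
    return np.matmul(a, np.swapaxes(b, 1, 2))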
Example 2
    def check_target(target, dev):
        print("Running on target: %s" % target)
        fcompute, fschedule = tvm.topi.testing.dispatch(
            target, _conv3d_ncdhw_implement)
        with tvm.target.Target(target):
            C = fcompute(A, W, (stride, stride, stride), padding,
                         (dilation, dilation, dilation), dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = fschedule([C])

        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        if add_bias:
            func = tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, c)
        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-4, atol=1e-6)
Example 3
def test_softmax(target, dev, shape, dtype, ref_data, softmax_operation):
    target = tvm.target.Target(target)
    if target.kind.name == "vulkan" and dtype == "float64":
        # https://www.khronos.org/registry/SPIR-V/specs/1.0/GLSL.std.450.html
        pytest.xfail("Vulkan GLSL.std.450 does not support 64-bit floats")

    A = te.placeholder(shape, dtype=dtype, name="A")

    topi_op = configs[softmax_operation]["topi"]
    B = topi_op(A, axis=1)

    with tvm.target.Target(target):
        fschedule = tvm.topi.testing.dispatch(target, _softmax_schedule)
        s = fschedule(B)

    a_np, b_np = ref_data

    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
    f = tvm.build(s, [A, B], target)
    f(a, b)
    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
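The ref_data fixture above supplies a numpy input/output pair. For the plain softmax operation over axis=1, a reference could be computed with a sketch like the following (illustrative; the actual fixture in the test suite may be built differently):

import numpy as np

def softmax_ref(a_np, axis=1):
    # Numerically stable softmax along `axis`.
    e = np.exp(a_np - np.max(a_np, axis=axis, keepdims=True))
    return e / np.sum(e, axis=axis, keepdims=True)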
Example 4
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)

        with tvm.target.Target(device):
            b = topi.vision.rcnn.roi_pool_nchw(a,
                                               rois,
                                               pooled_size=pooled_size,
                                               spatial_scale=spatial_scale)
            s_func = tvm.topi.testing.dispatch(device, _roi_pool_schedule)
            s = s_func(b)

        tvm_a = tvm.nd.array(a_np, ctx)
        tvm_rois = tvm.nd.array(rois_np, ctx)
        tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype),
                             ctx=ctx)
        f = tvm.build(s, [a, rois, b], device)
        f(tvm_a, tvm_rois, tvm_b)
        tvm.testing.assert_allclose(tvm_b.asnumpy(), b_np, rtol=1e-4)
Example 5
    def check_device(device):
        ctx = tvm.context(device, 0)
        print("Running on target: %s" % device)
        with tvm.target.Target(device):
            fcompute, fschedule = tvm.topi.testing.dispatch(
                device, _conv3d_ndhwc_tensorcore_implement
            )
            C = fcompute(A, W, stride, padding, dilation, "float32")
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = fschedule([C])

        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
        if add_bias:
            func = tvm.build(
                s,
                [A, W, bias, C],
                device,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
                % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                device,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
                % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
            )
            func(a, w, c)

        rtol = 1e-3
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=rtol)
Example 6
def conv1d_strategy(attrs, inputs, out_type, target):
    """conv1d generic strategy"""
    logger.warning("conv1d is not optimized for this platform.")
    layout = attrs.data_layout
    dilation = get_const_tuple(attrs.dilation)
    if dilation[0] < 1:
        raise ValueError("dilation should be a positive value")
    strategy = _op.OpStrategy()
    if layout == "NCW":
        strategy.add_implementation(
            wrap_compute_conv1d(topi.nn.conv1d_ncw),
            wrap_topi_schedule(topi.generic.schedule_conv1d_ncw),
            name="conv1d_ncw.generic",
        )
    elif layout == "NWC":
        strategy.add_implementation(
            wrap_compute_conv1d(topi.nn.conv1d_nwc),
            wrap_topi_schedule(topi.generic.schedule_conv1d_nwc),
            name="conv1d_nwc.generic",
        )
    else:
        raise ValueError("Unsupported conv1d layout {}".format(layout))
    return strategy
Example 7
    def check_device(target, dev):
        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            fcompute, fschedule = tvm.topi.testing.dispatch(
                target, _batch_matmul_implement)
            out = fcompute(x, y)
            if not dynamic:
                s = fschedule([out])
                out_shape = out.shape
            else:
                s = te.create_schedule(out.op)
                out_shape = (batch_size, M, N)

            if debug:
                print(tvm.lower(s, [x, y, out], simple_mode=True))

        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(out_shape), dtype=dtype),
                         dev)
        f = tvm.build(s, [x, y, out], target, name="dense")
        f(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
Example 8
def verify_relu(m, n, dtype="float32"):
    A = te.placeholder((m, n), name="A", dtype=dtype)
    B = topi.nn.relu(A)

    a_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(A.shape)).astype(A.dtype)
    b_np = a_np * (a_np > 0)

    def check_target(target, dev):
        if dtype == "float16" and target == "cuda" and not have_fp16(tvm.gpu(0).compute_version):
            print("Skip because %s does not have fp16 support" % target)
            return
        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            s = tvm.topi.testing.get_elemwise_schedule(target)(B)

        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
        foo = tvm.build(s, [A, B], target, name="relu")
        foo(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)

    for target, dev in tvm.testing.enabled_targets():
        check_target(target, dev)
Example 9
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)

        with tvm.target.Target(device):
            B = topi.nn.conv2d(A,
                               W,
                               stride,
                               padding,
                               dilation,
                               layout="NHWC",
                               out_dtype="int32")
            s = topi.x86.schedule_conv2d_nhwc_pack_int8([B])
        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype),
                         ctx)
        func = tvm.build(s, [A, W, B], device)
        func(a, w, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
Example 10
def resize1d_shape_func(attrs, inputs, _):
    """
    Shape function for resize1d op.
    """
    layout = attrs.layout
    width_axis = channel_axis = 1
    for i, letter in enumerate(layout):
        if letter == "N":
            batch_axis = i
        if letter == "W":
            width_axis = i
        if letter == "C":
            channel_axis = i
    size = get_const_tuple(attrs.size)
    return [
        _resize1d_shape_func(
            inputs[0],
            convert(size),
            convert(batch_axis),
            convert(width_axis),
            convert(channel_axis),
        )
    ]
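The hybrid-script helper _resize1d_shape_func (not shown here) essentially copies the input shape and overwrites the width axis with the requested size. Under that assumption, the logic in plain Python looks roughly like this (a sketch, not the actual hybrid script):

def resize1d_out_shape(data_shape, size, width_axis):
    # e.g. data_shape = (1, 16, 3) in "NWC" with size = (32,) -> (1, 32, 3)
    out = list(data_shape)
    out[width_axis] = size[0]
    return tuple(out)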
Example 11
    def test_batch_matmul_int8(self, hexagon_session: Session, x_batch, y_batch, M, N, K):
        dtype = "int8"
        out_dtype = "int8"
        assert x_batch == y_batch or x_batch == 1 or y_batch == 1
        x = te.placeholder((x_batch, M, K), name="x", dtype=dtype)
        y = te.placeholder((y_batch, N, K), name="y", dtype=dtype)

        def get_ref_data():
            a_np = np.random.randint(low=-128, high=127, size=(x_batch, M, K)).astype(dtype)
            b_np = np.random.randint(low=-128, high=127, size=(y_batch, N, K)).astype(dtype)
            c_np = tvm.topi.testing.batch_matmul(a_np, b_np, out_dtype=out_dtype)
            return (a_np, b_np, c_np)

        # get the test data
        a_np, b_np, c_np = get_ref_data()

        target_hexagon = tvm.target.hexagon("v68")
        with tvm.target.Target(target_hexagon):
            fcompute = topi.nn.batch_matmul
            fschedule = topi.hexagon.schedule_batch_matmul
            out = fcompute(x, y)
            s = fschedule([out])

        func = tvm.build(
            s,
            [x, y, out],
            tvm.target.Target(target_hexagon, host=target_hexagon),
            name="batch_matmul_int8",
        )
        mod = hexagon_session.load_module(func)

        dev = hexagon_session.device
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out_dtype), dev)
        mod["batch_matmul_int8"](a, b, c)
        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
Example 12
def test_dense(hexagon_session, batch_size, in_dim, out_dim, use_bias,
               in_dtype, out_dtype, dense_ref_data):
    if in_dtype == "float16":
        pytest.xfail("float16 is not supported.")

    if "int" in in_dtype:
        tol = {"atol": 0, "rtol": 0}
    elif in_dtype == "float32":
        tol = {"rtol": 1e-5, "atol": 1e-5}

    A = te.placeholder((batch_size, in_dim), name="A", dtype=in_dtype)
    B = te.placeholder((out_dim, in_dim), name="B", dtype=in_dtype)
    C = te.placeholder((out_dim, ), name="C", dtype=out_dtype)

    a_np, b_np, c_np, d_np = dense_ref_data

    fcompute = topi.nn.dense
    fschedule = topi.hexagon.schedule_dense

    target_hexagon = tvm.target.hexagon("v68")
    with tvm.target.Target(target_hexagon):
        D = fcompute(A, B, C if use_bias else None, out_dtype)
        D = topi.nn.relu(D)
        s = fschedule([D])

    func = tvm.build(s, [A, B, C, D],
                     tvm.target.Target(target_hexagon, host=target_hexagon),
                     name="dense")
    mod = hexagon_session.load_module(func)

    dev = hexagon_session.device
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(b_np, dev)
    c = tvm.nd.array(c_np, dev)
    d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), dev)
    mod["dense"](a, b, c, d)
    tvm.testing.assert_allclose(d.numpy(), d_np, **tol)
Example 13
    def check_target(target):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return

        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            fcompute, fschedule = tvm.topi.testing.dispatch(
                target, _group_conv2d_nchw_implement)
            C = fcompute(A, W, stride, padding, dtype, output_padding, groups)
            s = fschedule([C])

        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        func = tvm.build(
            s,
            [A, W, C],
            target,
            name="group_conv2d_transpose_%d_%d_%s_%d_%s_%s_%s_%s_%d" % (
                batch,
                in_channel,
                in_size,
                num_filter,
                kernel,
                stride,
                padding,
                output_padding,
                groups,
            ),
        )
        func(a, w, c)
        c = c.numpy()
        for measurement, reference in zip(c, c_np):
            tvm.testing.assert_allclose(measurement, reference, rtol=1e-5)
Example 14
def split_shape_func(attrs, inputs, _):
    """
    Shape function for split op.
    """
    if isinstance(attrs.indices_or_sections, (int, tvm.tir.IntImm)):
        indices_or_sections = get_const_int(attrs.indices_or_sections)
        assert indices_or_sections > 0, "Slice count must be > 0"
    else:
        indices_or_sections = list(get_const_tuple(attrs.indices_or_sections))
        assert sorted(indices_or_sections)[0] > 0 and indices_or_sections == sorted(
            indices_or_sections
        ), "split_indices must be sorted"

    axis = get_const_int(attrs.axis)

    if axis < 0:
        axis += get_const_int(inputs[0].shape[0])

    num_out = (
        indices_or_sections
        if isinstance(indices_or_sections, int)
        else len(indices_or_sections) + 1
    )

    param_is_indices = isinstance(indices_or_sections, int)
    if param_is_indices:
        indices_or_sections = [indices_or_sections]
    return [
        _split_shape_func(
            inputs[0],
            convert(i),
            convert(indices_or_sections),
            convert(param_is_indices),
            convert(axis),
        )
        for i in range(num_out)
    ]
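As a concrete check of the num_out logic above, a list of split indices yields one more output than it has entries, exactly as numpy's split does:

import numpy as np

parts = np.split(np.zeros((2, 12, 8)), [3, 7], axis=1)
print([p.shape for p in parts])  # [(2, 3, 8), (2, 4, 8), (2, 5, 8)]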
Example 15
def verify_reorg(batch, in_size, in_channel, stride):
    """Verify reorg operator by comparing outputs from tvm and numpy implementation"""
    in_height = in_width = in_size

    A = te.placeholder((batch, in_channel, in_height, in_width), name="A")
    B = topi.vision.reorg(A, stride)

    a_shape = get_const_tuple(A.shape)
    dtype = A.dtype

    def get_ref_data_reorg():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        b_np = tvm.topi.testing.reorg_python(a_np, stride)
        return a_np, b_np

    a_np, b_np = get_ref_data_reorg()

    def check_device(device):
        """Cheching devices is enabled or not"""
        ctx = tvm.context(device, 0)
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.Target(device):
            s_func = tvm.topi.testing.dispatch(device, _reorg_schedule)
            s = s_func([B])
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype),
                         ctx)
        func = tvm.build(s, [A, B], device)
        func(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)

    for device in ["llvm", "cuda"]:
        check_device(device)
Example 16
def crop_and_resize_func(attrs, inputs, _):
    """
    Shape function for crop_and_resize op.
    """
    layout = attrs.layout
    height_axis = width_axis = channel_axis = 1
    for i, letter in enumerate(layout):
        if letter == "H":
            height_axis = i
        if letter == "W":
            width_axis = i
        if letter == "C":
            channel_axis = i
    crop_size = get_const_tuple(attrs.crop_size)
    return [
        _crop_and_resize_func(
            inputs[0],
            inputs[1],
            convert(crop_size),
            convert(height_axis),
            convert(width_axis),
            convert(channel_axis),
        )
    ]
Example 17
def squeeze_shape_func(attrs, inputs, _):
    """
    Shape function for squeeze op.
    """
    axis = attrs.axis if attrs.axis is None else get_const_tuple(attrs.axis)
    keep_axes = []
    remove_axes = []
    if axis is not None:
        for i in range(inputs[0].shape[0].value):
            if i not in axis:
                keep_axes.append(i)
            else:
                remove_axes.append(i)

    # Due to current relay type system, it is possible even
    # a static kernel function needs shape function. To handle
    # this case, we allow axis to be None in squeeze shape func
    # for now.
    # TODO(kevinthesun): Enhance relay type system to avoid this.
    if keep_axes:
        out = _squeeze_shape_func(inputs[0], convert(keep_axes), convert(remove_axes))
    else:
        out = te.compute((), lambda *indices: 0)
    return [out]
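For instance, squeezing axes (1, 3) of a (4, 1, 8, 1) input gives keep_axes = [0, 2] and remove_axes = [1, 3], so the emitted shape function produces the output shape (4, 8).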
Example 18
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        with tvm.target.Target(device):
            fcompute, fschedule = tvm.topi.testing.dispatch(device, _roi_align_implement)
            b = fcompute(
                a,
                rois,
                pooled_size=pooled_size,
                spatial_scale=spatial_scale,
                sample_ratio=sample_ratio,
                mode=mode,
            )
            s = fschedule(b)

        tvm_a = tvm.nd.array(a_np, ctx)
        tvm_rois = tvm.nd.array(rois_np, ctx)
        tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), ctx=ctx)
        f = tvm.build(s, [a, rois, b], device)
        f(tvm_a, tvm_rois, tvm_b)
        tvm_val = tvm_b.asnumpy()
        tvm.testing.assert_allclose(tvm_val, b_np, rtol=1e-3, atol=1e-4)
Example 19
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not tvm.testing.device_enabled(device):
            print("Skipping %s because it is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.Target(device):
            C = topi.nn.conv2d(A, W, stride, padding, dilation, layout="NCHW", out_dtype=dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.generic.schedule_conv2d_nchw([C])

        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
        if add_bias:
            func = tvm.build(
                s,
                [A, W, bias, C],
                device,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
                % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                device,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
                % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation),
            )
            func(a, w, c)
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4)
Example 20
def verify_conv3d_ndhwc(
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation=1,
    add_bias=False,
    add_relu=False,
    devices="cuda",
):
    """Test the conv3d with tensorcore for ndhwc layout"""
    pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right = get_pad_tuple3d(
        padding, (kernel, kernel, kernel))
    padding_sum = pad_front + pad_top + pad_left + pad_back + pad_bottom + pad_right
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum,
           dilation))

    in_depth = in_height = in_width = in_size

    A = te.placeholder((batch, in_depth, in_height, in_width, in_channel),
                       name="A")
    W = te.placeholder((kernel, kernel, kernel, in_channel, num_filter),
                       name="W")
    bias = te.placeholder((1, 1, 1, 1, num_filter), name="bias")

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv3d_ndhwc.verify_conv3d_ndhwc")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = tvm.topi.testing.dilate_python(w_np,
                                               (1, 1, 1, dilation, dilation))
        c_np = tvm.topi.testing.conv3d_ndhwc_python(a_np, dw_np, stride,
                                                    padding)
        if add_bias:
            b_np = np.random.uniform(size=bias_shape).astype(dtype)
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)
        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def check_device(device):
        dev = tvm.device(device, 0)
        print("Running on target: %s" % device)
        with tvm.target.Target(device):
            fcompute, fschedule = tvm.topi.testing.dispatch(
                device, _conv3d_ndhwc_tensorcore_implement)
            C = fcompute(A, W, stride, padding, dilation, "float32")
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = fschedule([C])

        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        if add_bias:
            func = tvm.build(
                s,
                [A, W, bias, C],
                device,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                device,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, c)

        rtol = 1e-3
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=rtol)

    check_device(devices)
Example 21
def verify_pool_grad(n,
                     ic,
                     ih,
                     kh,
                     sh,
                     padding,
                     pool_type,
                     ceil_mode,
                     count_include_pad=True,
                     add_relu=False):
    """verify function of pool_grad"""
    iw = ih
    kw = kh
    sw = sh
    pt, pl, pb, pr = padding
    A = te.placeholder((n, ic, ih, iw), name="A")
    B = topi.nn.pool2d(
        A,
        kernel=[kh, kw],
        stride=[sh, sw],
        dilation=[1, 1],
        padding=padding,
        pool_type=pool_type,
        ceil_mode=ceil_mode,
        layout="NCHW",
        count_include_pad=count_include_pad,
    )
    dtype = A.dtype

    bshape = get_const_tuple(B.shape)
    ashape = get_const_tuple(A.shape)
    if ceil_mode:
        assert bshape[2] == int(
            math.ceil(float(ashape[2] - kh + pt + pb) / sh) + 1)
        assert bshape[3] == int(
            math.ceil(float(ashape[3] - kw + pl + pr) / sw) + 1)
    else:
        assert bshape[2] == int(
            math.floor(float(ashape[2] - kh + pt + pb) / sh) + 1)
        assert bshape[3] == int(
            math.floor(float(ashape[3] - kw + pl + pr) / sw) + 1)
    OutGrad = te.placeholder(bshape, name="OutGrad")
    PoolGrad = topi.nn.pool_grad(
        OutGrad,
        A,
        kernel=[kh, kw],
        stride=[sh, sw],
        padding=padding,
        pool_type=pool_type,
        ceil_mode=ceil_mode,
        layout="NCHW",
        count_include_pad=count_include_pad,
    )
    if add_relu:
        PoolGrad = topi.nn.relu(PoolGrad)

    a_np = np.random.uniform(low=0.001, size=(n, ic, ih, iw)).astype(dtype)
    out_grad_np = np.random.uniform(low=0.001, size=bshape).astype(dtype)
    pool_grad_np = tvm.topi.testing.pool_grad_nchw(
        a_np,
        out_grad_np,
        pool_size=(kh, kw),
        strides=(sh, sw),
        padding=padding,
        pool_type=pool_type,
        ceil_mode=ceil_mode,
        count_include_pad=count_include_pad,
    )
    if add_relu:
        pool_grad_np = np.maximum(pool_grad_np, 0.0)

    def check_target(target, dev):
        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            s_func = tvm.topi.testing.dispatch(target, _pool_grad_schedule)
            s = s_func(PoolGrad)

        a = tvm.nd.array(a_np, dev)
        out_grad = tvm.nd.array(out_grad_np, dev)
        pool_grad = tvm.nd.array(
            np.zeros(get_const_tuple(PoolGrad.shape), dtype=dtype), dev)
        f = tvm.build(s, [A, OutGrad, PoolGrad], target)
        f(a, out_grad, pool_grad)
        tvm.testing.assert_allclose(pool_grad.asnumpy(),
                                    pool_grad_np,
                                    rtol=1e-5)

    for target, dev in tvm.testing.enabled_targets():
        check_target(target, dev)
Example 22
def bitserial_dense(cfg,
                    data,
                    weight,
                    data_bits,
                    weight_bits,
                    pack_dtype="uint32",
                    out_dtype="int16",
                    unipolar=True):
    """Bitserial dense implementation. TODO: Why are these separate

    Parameters
    ----------
    data : tvm.te.Tensor
        2-D with shape [batch, in_dim]
    weight : tvm.te.Tensor
        2-D with shape [out_dim, in_dim] or
        3-D with shape [out_dim, weight_bits, in_dim]
    Returns
    -------
    output : tvm.te.Tensor
        2-D with shape [batch, out_dim]
    """
    data_packed = bitpack(data,
                          data_bits,
                          pack_axis=1,
                          bit_axis=1,
                          pack_type=pack_dtype)
    if len(weight.shape) == 2:
        weight_packed = bitpack(weight,
                                weight_bits,
                                pack_axis=1,
                                bit_axis=1,
                                pack_type=pack_dtype)
    else:
        weight_packed = weight
    Y, DB, K = get_const_tuple(data_packed.shape)
    X, WB, _ = get_const_tuple(weight_packed.shape)
    ######## Search space
    x, y = cfg.axis(X), cfg.axis(Y)
    db, wb, k = cfg.reduce_axis(DB), cfg.reduce_axis(WB), cfg.reduce_axis(K)
    ko, ki = cfg.define_split("tile_k", k, num_outputs=2)
    yo, yi = cfg.define_split("tile_y", y, num_outputs=2)
    xo, xi = cfg.define_split("tile_x", x, num_outputs=2)

    cfg.define_reorder(
        "reorder_0",
        [yo, xo, ko, yi, wb, db, ki, xi],
        policy="candidate",
        candidate=[[yo, xo, ko, yi, wb, db, ki, xi],
                   [yo, xo, yi, ko, wb, db, ki, xi]],
    )

    cfg.define_annotate("ann_reduce", [db, wb], policy="try_unroll")
    cfg.define_annotate("ann_spatial", [yi, xi], policy="try_unroll_vec")

    ###### Compute rule
    VX = cfg["tile_x"].size[-1]

    wvshape = (X // VX, WB, VX, K)
    oshape = (Y, X)

    k = te.reduce_axis((0, K), name="k")
    db = te.reduce_axis((0, DB), name="db")
    wb = te.reduce_axis((0, WB), name="wb")

    # Tile data and weights
    weight_vec = te.compute(
        wvshape,
        lambda xo, wb, vx, k: weight_packed[xo * VX + vx][wb][k],
        name="weight_vec")

    idxdiv = tvm.tir.indexdiv
    idxmod = tvm.tir.indexmod

    matmul_unipolar = te.compute(
        oshape,
        lambda i, j: te.sum(
            (tvm.tir.popcount(weight_vec[idxdiv(
                j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]) - tvm.
             tir.popcount(~weight_vec[idxdiv(j, VX), wb,
                                      idxmod(j, VX), k] & data_packed[i, db, k]
                          )).astype(out_dtype) << (db + wb).astype(out_dtype),
            axis=[wb, db, k],
        ),
        tag="bitserial_dense_unipolar",
    )

    matmul = te.compute(
        oshape,
        lambda i, j: te.sum(
            tvm.tir.popcount(weight_vec[idxdiv(j, VX), wb,
                                        idxmod(j, VX), k] & data_packed[
                                            i, db, k]).astype(out_dtype) <<
            (db + wb).astype(out_dtype),
            axis=[wb, db, k],
        ),
        tag="bitserial_dense",
    )

    # binary ops
    cfg.add_flop(2 * Y * X * K * binary_op_multiplier(pack_dtype))

    if unipolar:
        return matmul_unipolar
    return matmul
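The unipolar compute above scores each (activation bit-plane, weight bit-plane) pair as popcount(w & x) - popcount(~w & x) and shifts the result by the combined bit significance. A scalar numpy sketch of that accumulation (illustrative only; popcount and bitserial_dot_unipolar here are hypothetical helpers, not part of TOPI):

import numpy as np

def popcount(x):
    # Count set bits across the packed words of a 1-D unsigned integer array.
    return int(np.unpackbits(np.ascontiguousarray(x).view(np.uint8)).sum())

def bitserial_dot_unipolar(x_planes, w_planes, out_dtype="int16"):
    # x_planes: (DB, K) packed activation bit planes, w_planes: (WB, K) packed
    # weight bit planes (e.g. uint32). Returns one output element.
    acc = 0
    for db in range(x_planes.shape[0]):
        for wb in range(w_planes.shape[0]):
            ones = popcount(w_planes[wb] & x_planes[db])
            zeros = popcount(~w_planes[wb] & x_planes[db])
            acc += (ones - zeros) << (db + wb)
    return np.dtype(out_dtype).type(acc)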
Example 23
def conv2d_grad(orig, grad):
    """Gradient of conv2d"""
    attrs = orig.attrs
    data, weight = orig.args
    data_shape = get_const_tuple(data.checked_type.shape)
    weight_shape = get_const_tuple(weight.checked_type.shape)
    _, _, grad_h, grad_w = get_const_tuple(orig.checked_type.shape)
    batch, in_channel, in_h, in_w = data_shape
    out_channel, _, filter_h, filter_w = weight_shape

    # infer output_padding
    fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(
        get_const_tuple(attrs.padding), (filter_h, filter_w))
    stride_h, stride_w = get_const_tuple(attrs.strides)
    dilation_h, dilation_w = get_const_tuple(attrs.dilation)
    out_h = (grad_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h
    out_w = (grad_w - 1) * stride_w - fpad_left - fpad_right + filter_w
    output_padding = (in_h - out_h, in_w - out_w)

    assert attrs.data_layout == "NCHW", "only support NCHW data layout"
    assert attrs.kernel_layout == "OIHW", "only support OIHW kernel layout"
    assert attrs.out_layout in ["", "NCHW"], "only support NCHW output layout"

    backward_data = _nn.conv2d_transpose(
        grad,
        weight,
        strides=attrs.strides,
        padding=attrs.padding,
        dilation=attrs.dilation,
        groups=attrs.groups,
        output_padding=output_padding,
    )
    grad = tile(grad, [1, in_channel // attrs.groups, 1, 1])
    grad = reshape(grad, [-1, 1, 0, 0])  # batch * oc * ic // groups, 1, oh, ow
    data = reshape(data, [1, -1, 0, 0])  # 1, batch * ic, ih, iw

    backward_weight = _nn.conv2d(
        data,
        grad,
        strides=attrs.dilation,
        padding=attrs.padding,
        dilation=attrs.strides,
        groups=in_channel * batch,
    )
    # infer shape of backward_weight
    padded_weight_grad_h = (in_h - (grad_h - 1) * stride_h - 1 + fpad_top +
                            fpad_bottom) // dilation_h + 1
    padded_weight_grad_w = (in_w - (grad_w - 1) * stride_w - 1 + fpad_left +
                            fpad_right) // dilation_w + 1
    backward_weight = reshape(
        backward_weight,
        [
            batch,
            in_channel // attrs.groups,
            out_channel,
            padded_weight_grad_h,
            padded_weight_grad_w,
        ],
    )
    backward_weight = _sum(backward_weight, axis=0)
    backward_weight = transpose(backward_weight, [1, 0, 2, 3])

    assert padded_weight_grad_h >= filter_h
    assert padded_weight_grad_w >= filter_w
    if padded_weight_grad_h > filter_h or padded_weight_grad_w > filter_w:
        backward_weight = strided_slice(
            backward_weight,
            begin=[0, 0, 0, 0],
            end=[out_channel, in_channel // attrs.groups, filter_h, filter_w],
        )

    return [backward_data, backward_weight]
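As a sanity check of the output_padding inference: with in_h = 32, filter_h = 3, stride_h = 1 and symmetric padding of 1, the forward pass gives grad_h = 32, so out_h = (32 - 1) * 1 - 1 - 1 + 3 = 32 and the inferred output_padding is (0, 0); the transposed convolution for backward_data then exactly recovers the input spatial size.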
Example 24
def verify_pool(n,
                ic,
                ih,
                kh,
                sh,
                padding,
                pool_type,
                ceil_mode,
                count_include_pad=True):
    """verify function of pool"""
    iw = ih
    kw = kh
    sw = sh
    pt, pl, pb, pr = padding
    layout = "NCHW"
    A = te.placeholder((n, ic, ih, iw), name="A")
    B = topi.nn.pool(
        A,
        kernel=[kh, kw],
        stride=[sh, sw],
        padding=padding,
        pool_type=pool_type,
        ceil_mode=ceil_mode,
        layout="NCHW",
        count_include_pad=count_include_pad,
    )
    B = topi.nn.relu(B)
    dtype = A.dtype

    bshape = get_const_tuple(B.shape)
    ashape = get_const_tuple(A.shape)
    if ceil_mode:
        assert bshape[2] == int(
            math.ceil(float(ashape[2] - kh + pt + pb) / sh) + 1)
        assert bshape[3] == int(
            math.ceil(float(ashape[3] - kw + pl + pr) / sw) + 1)
    else:
        assert bshape[2] == int(
            math.floor(float(ashape[2] - kh + pt + pb) / sh) + 1)
        assert bshape[3] == int(
            math.floor(float(ashape[3] - kw + pl + pr) / sw) + 1)

    a_np = np.random.uniform(low=0.001, size=(n, ic, ih, iw)).astype(dtype)
    pad_np = np.zeros(shape=(n, ic, ih + pt + pb, iw + pl + pr)).astype(dtype)
    no_zero = (range(n), range(ic), (range(pt, ih + pt)), (range(pl, iw + pl)))
    pad_np[np.ix_(*no_zero)] = a_np
    _, oc, oh, ow = get_const_tuple(B.shape)
    b_np = np.zeros(shape=(n, oc, oh, ow)).astype(dtype)

    if pool_type == "avg":
        for i in range(oh):
            for j in range(ow):
                if count_include_pad:
                    b_np[:, :, i, j] = np.mean(pad_np[:, :, i * sh:i * sh + kh,
                                                      j * sw:j * sw + kw],
                                               axis=(2, 3))
                else:
                    pad_count = np.sum(pad_np[:, :, i * sh:i * sh + kh,
                                              j * sw:j * sw + kw] > 0,
                                       axis=(2, 3))
                    b_np[:, :, i, j] = np.sum(
                        pad_np[:, :, i * sh:i * sh + kh, j * sw:j * sw + kw],
                        axis=(2, 3)) / np.maximum(pad_count, 1)

    elif pool_type == "max":
        for i in range(oh):
            for j in range(ow):
                b_np[:, :, i, j] = np.max(pad_np[:, :, i * sh:i * sh + kh,
                                                 j * sw:j * sw + kw],
                                          axis=(2, 3))
    b_np = np.maximum(b_np, 0.0)

    def check_device(device, ctx):
        print("Running on target: %s" % device)
        with tvm.target.Target(device):
            s_func = tvm.topi.testing.dispatch(device, _pool_schedule)
            s = s_func(B, layout)

        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
        f = tvm.build(s, [A, B], device)
        f(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=2e-5, atol=1e-5)

    for device, ctx in tvm.testing.enabled_targets():
        check_device(device, ctx)
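For example, with ih = 8, kh = 3, sh = 2 and (pt, pb) = (1, 1), the non-ceil branch expects bshape[2] = floor((8 - 3 + 1 + 1) / 2) + 1 = 4, while ceil_mode expects ceil(7 / 2) + 1 = 5, matching the assertions above.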
Example 25
def verify_group_conv2d_nchw(
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation,
    groups,
    add_bias=False,
    add_relu=False,
):
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding,
           dilation, groups))

    in_height = in_width = in_size

    A = te.placeholder((batch, in_channel, in_height, in_width), name="A")
    W = te.placeholder((num_filter, in_channel // groups, kernel, kernel),
                       name="W")
    bias = te.placeholder((num_filter, 1, 1), name="bias")

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = tvm.topi.testing.dilate_python(w_np,
                                               (1, 1, dilation, dilation))
        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride,
                                                   padding,
                                                   groups).astype(dtype)

        if add_bias:
            b_np = np.random.uniform(size=bias_shape).astype(dtype)
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)

        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def check_target(target):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return

        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            fcompute, fschedule = tvm.topi.testing.dispatch(
                target, _group_conv2d_nchw_implement)
            C = fcompute(A, W, stride, padding, dilation, groups, dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = fschedule([C])

        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        if add_bias:
            func = tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % (
                    batch,
                    in_channel,
                    in_size,
                    num_filter,
                    kernel,
                    stride,
                    padding,
                    dilation,
                    groups,
                ),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % (
                    batch,
                    in_channel,
                    in_size,
                    num_filter,
                    kernel,
                    stride,
                    padding,
                    dilation,
                    groups,
                ),
            )
            func(a, w, c)
        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)

    for target in ["llvm", "cuda"]:
        check_target(target)
Example 26
    def check_target(target):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        if target == "cuda" and not tvm.contrib.nvcc.have_int8(
                dev.compute_version):
            print("Skip because int8 intrinsics are not available")
            return

        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            C = topi.cuda.group_conv2d_NCHWc_int8(A, W, stride, padding,
                                                  dilation, groups, dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.cuda.schedule_group_conv2d_NCHWc_int8([C])

        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        if add_bias:
            func = tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % (
                    batch,
                    in_channel,
                    in_size,
                    num_filter,
                    kernel,
                    stride,
                    padding,
                    dilation,
                    groups,
                ),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % (
                    batch,
                    in_channel,
                    in_size,
                    num_filter,
                    kernel,
                    stride,
                    padding,
                    dilation,
                    groups,
                ),
            )
            func(a, w, c)
        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
Example 27
def verify_group_conv2d_NCHWc_int8(
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation,
    groups,
    add_bias=False,
    add_relu=False,
):
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding,
           dilation, groups))

    in_height = in_width = in_size

    A = te.placeholder((batch, in_channel, in_height, in_width),
                       name="A",
                       dtype="int8")
    W = te.placeholder((num_filter, in_channel // groups, kernel, kernel),
                       name="W",
                       dtype="int8")
    bias = te.placeholder(
        (num_filter // oc_block_factor, 1, 1, oc_block_factor),
        name="bias",
        dtype="int8")

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_NCHWc_int8"
             )
    def get_ref_data():
        a_np = np.random.randint(low=-128, high=127,
                                 size=a_shape).astype(dtype)
        w_np = np.random.randint(low=-128, high=128,
                                 size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = tvm.topi.testing.dilate_python(w_np,
                                               (1, 1, dilation, dilation))
        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride,
                                                   padding,
                                                   groups).astype(dtype)

        # convert to NCHWc
        _, _, out_height, out_width = c_np.shape
        c_np = c_np.reshape(
            (batch, num_filter // oc_block_factor, oc_block_factor, out_height,
             out_width)).transpose(0, 1, 3, 4, 2)

        if add_bias:
            b_np = np.random.uniform(size=bias_shape).astype(dtype)
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)

        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def check_target(target):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        if target == "cuda" and not tvm.contrib.nvcc.have_int8(
                dev.compute_version):
            print("Skip because int8 intrinsics are not available")
            return

        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            C = topi.cuda.group_conv2d_NCHWc_int8(A, W, stride, padding,
                                                  dilation, groups, dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.cuda.schedule_group_conv2d_NCHWc_int8([C])

        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        if add_bias:
            func = tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % (
                    batch,
                    in_channel,
                    in_size,
                    num_filter,
                    kernel,
                    stride,
                    padding,
                    dilation,
                    groups,
                ),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % (
                    batch,
                    in_channel,
                    in_size,
                    num_filter,
                    kernel,
                    stride,
                    padding,
                    dilation,
                    groups,
                ),
            )
            func(a, w, c)
        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)

    for target in ["cuda"]:
        check_target(target)
Example 28
def conv2d_winograd_nhwc_auto_scheduler_test(N,
                                             H,
                                             W,
                                             CI,
                                             CO,
                                             kernel_size=3,
                                             stride=1,
                                             padding=0,
                                             dilation=1):
    tile_size = 4
    inputs = te.placeholder((N, H, W, CI), name="inputs")
    N, H, W, CI = get_const_tuple(inputs.shape)
    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"

    KH = KW = kernel_size
    HPAD, WPAD, _, _ = topi.nn.get_pad_tuple(padding, (KH, KW))
    HSTR, WSTR = (stride, stride) if isinstance(stride, int) else stride
    assert HSTR == 1 and WSTR == 1 and KH == KW

    data_pad = topi.nn.pad(inputs, (0, HPAD, WPAD, 0), (0, HPAD, WPAD, 0),
                           name="data_pad")

    r = KW
    m = tile_size
    alpha = m + r - 1
    A, B, _ = winograd_transform_matrices(m, r, "float32")

    H = (H + 2 * HPAD - KH) // HSTR + 1
    W = (W + 2 * WPAD - KW) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW
    kshape = (alpha, alpha, CI, CO)
    kernel_pack = te.placeholder(kshape, inputs.dtype, name="weight")

    idxdiv = te.indexdiv
    idxmod = te.indexmod
    # pack input tile
    input_tile = te.compute(
        (alpha, alpha, P, CI),
        lambda eps, nu, p, ci: data_pad[idxdiv(p, (nH * nW))][idxmod(
            idxdiv(p, nW), nH) * m + eps][idxmod(p, nW) * m + nu][ci],
        name="input_tile",
    )

    # transform data
    r_a = te.reduce_axis((0, alpha), "r_a")
    r_b = te.reduce_axis((0, alpha), "r_b")
    data_pack = te.compute(
        (alpha, alpha, P, CI),
        lambda eps, nu, p, ci: te.sum(input_tile[r_a][r_b][p][ci] * B[r_a][eps]
                                      * B[r_b][nu],
                                      axis=[r_a, r_b]),
        name="data_pack",
        attrs={
            "auto_scheduler_simplify_const_tensor_indices":
            ["eps", "nu", "r_a", "r_b"]
        },
    )

    # do batch gemm
    ci = te.reduce_axis((0, CI), name="ci")
    bgemm = te.compute(
        (alpha, alpha, P, CO),
        lambda eps, nu, p, co: te.sum(data_pack[eps][nu][p][ci] * kernel_pack[
            eps][nu][ci][co],
                                      axis=[ci]),
        name="bgemm",
    )

    # inverse transform
    r_a = te.reduce_axis((0, alpha), "r_a")
    r_b = te.reduce_axis((0, alpha), "r_b")
    inverse = te.compute(
        (m, m, P, CO),
        lambda vh, vw, p, co: te.sum(
            bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]),
        name="inverse",
        attrs={
            "auto_scheduler_simplify_const_tensor_indices":
            ["vh", "vw", "r_a", "r_b"]
        },
    )

    # output
    output = te.compute(
        (N, H, W, CO),
        lambda n, h, w, co: inverse[idxmod(h, m),
                                    idxmod(w, m), n * nH * nW + idxdiv(h, m) *
                                    nW + idxdiv(w, m), co],
        name="conv2d_winograd",
    )

    return [inputs, kernel_pack, output]
Example 29
def verify_deformable_conv2d_nchw(
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation=1,
    deformable_groups=1,
    groups=1,
):
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d, %d)" % (
        batch,
        in_channel,
        in_size,
        num_filter,
        kernel,
        stride,
        padding,
        dilation,
        deformable_groups,
        groups,
    ))

    A = te.placeholder((batch, in_channel, in_size, in_size), name="A")
    out_size = (in_size -
                (kernel - 1) * dilation - 1 + 2 * padding) // stride + 1
    Offset = te.placeholder(
        (batch, deformable_groups * kernel * kernel * 2, out_size, out_size),
        name="offset")
    W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W")
    bias = te.placeholder((num_filter, 1, 1), name="bias")

    a_shape = get_const_tuple(A.shape)
    offset_shape = get_const_tuple(Offset.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize(
        "topi.tests.test_topi_deformable_conv2d_nchw.verify_deformable_conv2d_nchw"
    )
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        offset_np = np.random.randn(*offset_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        c_np = tvm.topi.testing.deformable_conv2d_nchw_python(
            a_np, offset_np, w_np, stride, padding, dilation,
            deformable_groups, groups)

        return a_np, offset_np, w_np, c_np

    a_np, offset_np, w_np, c_np = get_ref_data()

    def check_device(device):
        dev = tvm.device(device, 0)
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        fcompute, fschedule = tvm.topi.testing.dispatch(
            device, _deformable_conv2d_nchw_implement)
        with tvm.target.Target(device):
            C = fcompute(A, Offset, W, stride, padding, dilation,
                         deformable_groups, groups, dtype)
            s = fschedule([C])

            a = tvm.nd.array(a_np, dev)
            offset = tvm.nd.array(offset_np, dev)
            w = tvm.nd.array(w_np, dev)
            c = tvm.nd.empty(c_np.shape, dtype=c_np.dtype, device=dev)

            func = tvm.build(s, [A, Offset, W, C], device)
            func(a, offset, w, c)
            tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)

    for device in ["llvm", "cuda"]:
        check_device(device)
Example 30
def test_util():
    x = tvm.tir.const(100, "int32")
    assert utils.get_const_int(x) == 100
    assert utils.get_const_tuple((x, x)) == (100, 100)
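Example 30 pins down the contract the tests above rely on: get_const_int and get_const_tuple turn constant TIR expressions into plain Python values. A stripped-down sketch of that behavior (const_int and const_tuple are illustrative stand-ins, not TVM's actual implementation, which also folds more general constant expressions):

import tvm

def const_int(expr):
    # Unwrap a constant integer expression into a Python int.
    if isinstance(expr, int):
        return expr
    if isinstance(expr, tvm.tir.IntImm):
        return int(expr.value)
    raise ValueError("expected a constant integer, got %r" % (expr,))

def const_tuple(exprs):
    # Convert an iterable of constant expressions into a tuple of ints.
    return tuple(const_int(e) for e in exprs)

assert const_tuple((tvm.tir.const(100, "int32"),) * 2) == (100, 100)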