Esempio n. 1
0
def np_conv(na, nw, padding, stride=1):
    batch, in_channel, in_height, in_width = na.shape
    _, num_filter, kernel_h, kernel_w = nw.shape
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w))
    pad_h = pad_top + pad_bottom
    pad_w = pad_left + pad_right

    out_channel = num_filter
    out_height = (in_height - kernel_h + pad_h) // stride_h + 1
    out_width = (in_width - kernel_w + pad_w) // stride_w + 1
    nb = np.zeros((batch, out_channel, out_height, out_width))
    for n in range(batch):
        for f in range(out_channel):
            for c in range(in_channel):
                if pad_h > 0 or pad_w > 0:
                    apad = np.zeros((in_height + pad_h, in_width + pad_w))
                    apad[pad_top : pad_top + in_height, pad_left : pad_left + in_width] = na[n, c]
                else:
                    apad = na[n, c]
                out = scipy.signal.convolve2d(apad, np.rot90(np.rot90(nw[f, c])), mode="valid")
                nb[n, f] += out[::stride, ::stride]
    return nb
Esempio n. 2
0
        def get_ref_data():
            out_grad_np = np.random.uniform(size=out_grad_shape).astype(dtype)
            input_np = np.random.uniform(size=in_shape).astype(dtype)
            dilated_out_grad_np = tvm.topi.testing.dilate_python(
                out_grad_np, [1, stride_h, stride_w, 1])

            pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
                [padding_h, padding_w], (filter_h, filter_w))
            padded_input_np = np.zeros(
                (batch, in_h + pad_top + pad_bottom,
                 in_w + pad_left + pad_right, in_channel))
            padded_input_np[:, pad_top:in_h + pad_top,
                            pad_left:in_w + pad_left, :] = input_np

            weight_grad_np = np.zeros(
                (filter_h, filter_w, in_channel, channel_multiplier))
            for c in range(in_channel):
                for m in range(channel_multiplier):
                    for b in range(batch):
                        weight_grad_np[:, :, c, m] += signal.convolve2d(
                            padded_input_np[b, :, :, c],
                            np.rot90(
                                dilated_out_grad_np[b, :, :,
                                                    c * channel_multiplier +
                                                    m % channel_multiplier],
                                2,
                            ),
                            mode="valid",
                        )[0:filter_h, 0:filter_w]
            return (out_grad_np, input_np, weight_grad_np)
Esempio n. 3
0
def conv2d_hwcn_python(a_np, w_np, stride, padding):
    """Convolution operator in HWCN layout.

    Parameters
    ----------
    a_np : numpy.ndarray
        4-D with shape [in_height, in_width, in_channel, batch]

    w_np : numpy.ndarray
        4-D with shape [filter_height, filter_width, in_channel, num_filter]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str or a list/tuple of 2 or 4 ints
        Padding size, or ['VALID', 'SAME'], or
        [pad_height, pad_width] for 2 ints, or
        [pad_top, pad_left, pad_bottom, pad_right] for 2 ints

    Returns
    -------
    b_np : np.ndarray
        4-D with shape [out_height, out_width, out_channel, batch]
    """
    in_height, in_width, in_channel, batch = a_np.shape
    kernel_h, kernel_w, _, num_filter = w_np.shape
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel_h, kernel_w))
    pad_h = pad_top + pad_bottom
    pad_w = pad_left + pad_right
    # compute the output shape
    out_channel = num_filter
    out_height = (in_height - kernel_h + pad_h) // stride_h + 1
    out_width = (in_width - kernel_w + pad_w) // stride_w + 1
    # change the layout from HWCN to NCHW
    at = a_np.transpose((3, 2, 0, 1))
    wt = w_np.transpose((3, 2, 0, 1))
    bt = np.zeros((batch, out_channel, out_height, out_width))
    # computation
    for n in range(batch):
        for f in range(out_channel):
            for c in range(in_channel):
                if pad_h > 0 or pad_w > 0:
                    apad = np.zeros((in_height + pad_h, in_width + pad_w))
                    apad[pad_top:pad_top + in_height,
                         pad_left:pad_left + in_width] = at[n, c]
                else:
                    apad = at[n, c]
                out = scipy.signal.convolve2d(apad,
                                              np.rot90(np.rot90(wt[f, c])),
                                              mode="valid")
                bt[n, f] += out[::stride, ::stride]
    return bt.transpose((2, 3, 1, 0))
Esempio n. 4
0
def conv2d_grad(orig, grad):
    """Gradient of conv2d"""
    attrs = orig.attrs
    data, weight = orig.args
    data_shape = get_const_tuple(data.checked_type.shape)
    weight_shape = get_const_tuple(weight.checked_type.shape)
    _, _, grad_h, grad_w = get_const_tuple(orig.checked_type.shape)
    _, _, in_h, in_w = data_shape
    _, _, filter_h, filter_w = weight_shape

    # infer output_padding
    fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(
        get_const_tuple(attrs.padding), (filter_h, filter_w))
    stride_h, stride_w = get_const_tuple(attrs.strides)
    out_h = (grad_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h
    out_w = (grad_w - 1) * stride_w - fpad_left - fpad_right + filter_w
    output_padding = (in_h - out_h, in_w - out_w)

    assert attrs.data_layout == "NCHW", "only support NCHW data layout"
    assert attrs.kernel_layout == "OIHW", "only support OIHW kernel layout"
    assert attrs.out_layout in ["", "NCHW"], "only support NCHW output layout"

    if attrs.out_dtype in ["", None]:
        assert data.checked_type, "Call InferType first."
        out_dtype = data.checked_type.dtype
    else:
        out_dtype = attrs.out_dtype

    backward_data = _nn.conv2d_transpose(
        grad,
        weight,
        strides=attrs.strides,
        padding=attrs.padding,
        dilation=attrs.dilation,
        groups=attrs.groups,
        output_padding=output_padding,
        out_dtype=out_dtype,
    )

    backward_weight = _nn.conv2d_backward_weight(
        grad,
        data,
        strides=attrs.strides,
        padding=attrs.padding,
        dilation=attrs.dilation,
        groups=attrs.groups,
        channels=attrs.channels,
        kernel_size=(filter_h, filter_w),
        grad_layout=attrs.out_layout
        if attrs.out_layout else attrs.data_layout,
        data_layout=attrs.data_layout,
        kernel_layout=attrs.kernel_layout,
        out_dtype=out_dtype,
    )

    return [backward_data, backward_weight]
def compile_depthwise_NHWC_int8_arm(
    batch,
    in_channel,
    in_size,
    kernel,
    depth_multiplier,
    stride,
    padding,
    add_bias=False,
    dilation=1,
):
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel, kernel))
    padding_sum = pad_top + pad_left + pad_bottom + pad_right

    in_height = in_width = in_size
    A = te.placeholder((batch, in_height, in_width, in_channel),
                       name="A",
                       dtype="int16")
    W = te.placeholder((kernel, kernel, in_channel, depth_multiplier),
                       name="W",
                       dtype="int16")
    bias = te.placeholder((in_channel * depth_multiplier, ),
                          name="bias",
                          dtype="int32")
    dtype = "int32"

    device = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"
    compute = topi.arm_cpu.compute_depthwise_conv2d_nhwc
    schedule = topi.arm_cpu.schedule_depthwise_conv2d_nhwc

    if not tvm.testing.device_enabled(device):
        print("Skip because %s is not enabled" % device)
        return

    print("Compiling on arm AArch64 target: %s" % device)
    with tvm.target.Target(device):
        assert topi.arm_cpu.arm_utils.is_aarch64_arm(
        ), "AArch64 target not recognized"

        C = compute(A, W, (stride, stride), padding, (dilation, dilation),
                    dtype)
        if add_bias:
            C += bias
            ins_outs = [A, W, bias, C]
        else:
            ins_outs = [A, W, C]

        s = schedule([C])

        func = tvm.build(
            s,
            ins_outs,
            device,
            name="depthwise_conv2d",
        )
Esempio n. 6
0
def conv2d_transpose_packed(cfg,
                            data,
                            kernel,
                            strides,
                            padding,
                            out_dtype,
                            output_padding=(0, 0)):
    """Packed conv2d_transpose compute"""
    ishape = get_const_tuple(data.shape)
    kshape = get_const_tuple(kernel.shape)
    b, c_i, i_h, i_w, t_b, t_ci = ishape
    c_o, _, k_h, k_w, t_co, t_ci = kshape
    stride_h, stride_w = strides
    opad_h, opad_w = output_padding
    # FIXME(tmoreau89): currently IR pass breaks when output padding != (0,0)
    assert opad_h == 0 and opad_w == 0, "VTA does not support output padding for now"

    # derive padding parameters
    fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(
        padding, (k_h, k_w))
    bpad_top = k_h - 1 - fpad_top
    bpad_bottom = k_h - 1 - fpad_bottom + opad_h
    bpad_left = k_w - 1 - fpad_left
    bpad_right = k_w - 1 - fpad_right + opad_w

    # padding stage
    dilated_input = topi.nn.dilate(data, [1, 1, stride_h, stride_w, 1, 1])
    data_pad = topi.nn.pad(dilated_input, [0, 0, bpad_top, bpad_left, 0, 0],
                           [0, 0, bpad_bottom, bpad_right, 0, 0])

    # convolution transpose stage
    out_h = (i_h - 1) * stride_h - fpad_top - fpad_bottom + k_h + opad_h
    out_w = (i_w - 1) * stride_w - fpad_left - fpad_right + k_w + opad_w
    oshape = (b, c_o, out_h, out_w, t_b, t_co)
    d_c = te.reduce_axis((0, c_i), name="d_c")
    d_h = te.reduce_axis((0, k_h), name="d_h")
    d_w = te.reduce_axis((0, k_w), name="d_w")
    d_ci = te.reduce_axis((0, t_ci), name="d_ci")

    out = te.compute(
        oshape,
        lambda i_n, i_c, i_h, i_w, j_n, j_c: te.sum(
            data_pad(i_n, d_c, i_h + d_h, i_w + d_w, j_n, d_ci).astype(
                out_dtype) * kernel[i_c, d_c, d_h, d_w, j_c, d_ci].astype(
                    out_dtype),
            axis=[d_c, d_h, d_w, d_ci],
        ),
        tag="packed_conv2d_transpose",
        name="res",
    )

    cfg.add_flop(2 * np.prod(topi.utils.get_const_tuple(oshape)) * kshape[2] *
                 kshape[3] * ishape[1] * ishape[-1])

    return out
def depthwise_conv2d_python_nchw(input_np, filter_np, stride, padding):
    """Depthwise convolution operator in NCHW layout.

    Parameters
    ----------
    input_np : numpy.ndarray
        4-D with shape [batch, in_channel, in_height, in_width]

    filter_np : numpy.ndarray
        4-D with shape [in_channel, channel_multiplier, filter_height, filter_width]

    stride : list / tuple of 2 ints
        [stride_height, stride_width]

    padding : str
        'VALID' or 'SAME'

    Returns
    -------
    output_np : np.ndarray
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    batch, in_channel, in_height, in_width = input_np.shape
    _, channel_multiplier, filter_height, filter_width = filter_np.shape
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (filter_height, filter_width))
    pad_h = pad_top + pad_bottom
    pad_w = pad_left + pad_right

    out_channel = in_channel * channel_multiplier
    out_height = (in_height - filter_height + pad_h) // stride_h + 1
    out_width = (in_width - filter_width + pad_w) // stride_w + 1
    output_np = np.zeros((batch, out_channel, out_height, out_width))

    for i in range(batch):
        for j in range(out_channel):
            apad = input_np[i, j // channel_multiplier, :, :]
            if pad_h or pad_w:
                apad = np.pad(apad, [(pad_top, pad_bottom),
                                     (pad_left, pad_right)])

            conv = _convolve2d(
                apad,
                np.rot90(filter_np[j // channel_multiplier,
                                   j % channel_multiplier, :, :],
                         k=2),
            )
            output_np[i, j, :, :] = conv[::stride_h, ::stride_w, ]

    return output_np
Esempio n. 8
0
def conv_bwd(N, CI, HI, WI, CO, HO, WO, KSIZE, stride, padding, dtype):
    strides = (stride, stride)
    shape_data = (N, CI, HI, WI)
    shape_weight = (CO, CI, KSIZE, KSIZE)
    shape_grad_output = (N, CO, HO, WO)
    # given tensor
    data = te.placeholder(shape_data, name="data", dtype=dtype)
    weight = te.placeholder(shape_weight, name="weight", dtype=dtype)
    grad_output = te.placeholder(shape_grad_output,
                                 name="grad_output",
                                 dtype=dtype)
    # grad_data
    out_h = (HO - 1) * strides[0] - 2 * padding + KSIZE
    out_w = (WO - 1) * strides[1] - 2 * padding + KSIZE
    output_padding = (HI - out_h, WI - out_w)
    grad_data = topi.nn.conv2d_transpose_nchw(grad_output, weight, strides,
                                              padding, dtype, output_padding)
    # grad_weight
    dilation_h, dilation_w = (1, 1)
    batch, in_channel, in_h, in_w = shape_data
    out_channel, _, filter_h, filter_w = shape_weight
    grad_output_tmp = topi.tile(grad_output, [1, in_channel, 1, 1])
    grad_output_tmp = topi.reshape(
        grad_output_tmp, [batch * in_channel * out_channel, 1, HO, WO])
    data_tmp = topi.reshape(data, [1, in_channel * batch, HI, WI])
    grad_weight = topi.nn.group_conv2d_nchw(data_tmp,
                                            grad_output_tmp,
                                            stride=(dilation_h, dilation_w),
                                            padding=padding,
                                            dilation=strides,
                                            groups=in_channel * batch,
                                            out_dtype=dtype)
    # infer shape of grad_weight
    _, _, grad_h, grad_w = shape_grad_output
    fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(
        padding, (filter_h, filter_w))
    padded_weight_grad_h = (in_h - (grad_h - 1) * strides[0] - 1 + fpad_top +
                            fpad_bottom) // dilation_h + 1
    padded_weight_grad_w = (in_w - (grad_w - 1) * strides[1] - 1 + fpad_left +
                            fpad_right) // dilation_w + 1
    grad_weight = topi.reshape(grad_weight, [
        batch, in_channel, out_channel, padded_weight_grad_h,
        padded_weight_grad_w
    ])
    grad_weight = topi.sum(grad_weight, axis=0)
    grad_weight = topi.transpose(grad_weight, [1, 0, 2, 3])

    if padded_weight_grad_h > filter_h or padded_weight_grad_w > filter_w:
        grad_weight = topi.strided_slice(
            grad_weight,
            begin=[0, 0, 0, 0],
            end=[out_channel, in_channel, filter_h, filter_w])
        return [data, weight, grad_output, grad_data, grad_weight]

    return [data, weight, grad_output, grad_data, grad_weight]
Esempio n. 9
0
def _conv2d_nchw_python(a_np, w_np, stride, padding):
    """Convolution operator in NCHW layout.

    Parameters
    ----------
    a_np : numpy.ndarray
        4-D with shape [batch, in_channel, in_height, in_width]

    w_np : numpy.ndarray
        4-D with shape [num_filter, in_channel, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str or a list/tuple of 2 or 4 ints
        Padding size, or ['VALID', 'SAME'], or
        [pad_height, pad_width] for 2 ints, or
        [pad_top, pad_left, pad_bottom, pad_right] for 2 ints

    Returns
    -------
    b_np : np.ndarray
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    batch, in_channel, in_height, in_width = a_np.shape
    num_filter, _, kernel_h, kernel_w = w_np.shape
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel_h, kernel_w))
    pad_h = pad_top + pad_bottom
    pad_w = pad_left + pad_right
    # compute the output shape
    out_channel = num_filter
    out_height = (in_height - kernel_h + pad_h) // stride_h + 1
    out_width = (in_width - kernel_w + pad_w) // stride_w + 1
    b_np = np.zeros((batch, out_channel, out_height, out_width))
    # computation
    for n in range(batch):
        for f in range(out_channel):
            for c in range(in_channel):
                if pad_h > 0 or pad_w > 0:
                    apad = np.zeros((in_height + pad_h, in_width + pad_w))
                    apad[pad_top:pad_top + in_height,
                         pad_left:pad_left + in_width] = a_np[n, c]
                else:
                    apad = a_np[n, c]
                out = scipy.signal.convolve2d(apad,
                                              np.rot90(np.rot90(w_np[f, c])),
                                              mode="valid")
                b_np[n, f] += out[::stride_h, ::stride_w]
    return b_np
Esempio n. 10
0
def make_ethosu_conv2d(
    ifm,
    ifm_channels,
    ofm_channels,
    kernel_shape,
    padding,
    strides,
    dilation,
    lut=relay.const([], dtype="int8"),
    activation="NONE",
    ifm_layout="NHWC",
    ofm_layout="NHWC",
    weight_dtype="int8",
    scale_bias_dtype="uint8",
    rounding_mode="TFL",
    upscale="NONE",
):
    # conv params
    weight_shape = (ofm_channels, kernel_shape[0], kernel_shape[1],
                    ifm_channels)
    padding = get_pad_tuple(padding, kernel_shape)

    scale_bias_data = generate_weights_data((weight_shape[0], 10),
                                            scale_bias_dtype)
    scale_bias = relay.const(scale_bias_data, dtype=scale_bias_dtype)
    weight_data = generate_weights_data(weight_shape, weight_dtype)
    weight = relay.const(weight_data, dtype=weight_dtype)
    conv = ethosu_ops.ethosu_conv2d(
        ifm,
        weight,
        scale_bias,
        lut=lut,
        ifm_scale=0.5,
        ifm_zero_point=10,
        weight_zero_point=12,
        ofm_scale=0.25,
        ofm_zero_point=14,
        kernel_shape=kernel_shape,
        ofm_channels=ofm_channels,
        strides=strides,
        padding=padding,
        dilation=dilation,
        activation=activation,
        clip_min=10 if activation == "CLIP" else 0,
        clip_max=100 if activation == "CLIP" else 0,
        rounding_mode=rounding_mode,
        upscale=upscale,
        ifm_layout=ifm_layout,
        ofm_layout=ofm_layout,
    )
    return conv
Esempio n. 11
0
def make_ethosu_depthwise_conv2d(
    ifm,
    channels,
    kernel_shape,
    padding,
    strides,
    dilation,
    activation="NONE",
    ifm_layout="NHWC",
    ofm_layout="NHWC",
    weight_dtype="int8",
    scale_bias_dtype="uint8",
    rounding_mode="TFL",
):
    # params
    weight_shape = (channels, kernel_shape[0], kernel_shape[1], 1)
    padding = get_pad_tuple(padding, kernel_shape)

    scale_bias_data = generate_weights_data((weight_shape[0], 10),
                                            scale_bias_dtype)
    scale_bias = relay.const(scale_bias_data, dtype=scale_bias_dtype)
    weight_data = generate_weights_data(weight_shape, weight_dtype)
    weight = relay.const(weight_data, dtype=weight_dtype)
    depthwise = ethosu_ops.ethosu_depthwise_conv2d(
        ifm,
        weight,
        scale_bias,
        lut=relay.const([], dtype="int8"),
        ifm_scale=0.6,
        ifm_zero_point=11,
        weight_zero_point=13,
        ofm_scale=0.26,
        ofm_zero_point=15,
        kernel_shape=kernel_shape,
        ofm_channels=channels,
        strides=strides,
        padding=padding,
        dilation=dilation,
        activation=activation,
        clip_min=15 if activation == "CLIP" else 0,
        clip_max=105 if activation == "CLIP" else 0,
        rounding_mode=rounding_mode,
        upscale="NONE",
        ifm_layout=ifm_layout,
        ofm_layout=ofm_layout,
    )
    return depthwise
Esempio n. 12
0
        def get_ref_data():
            out_grad_np = np.random.uniform(size=out_grad_shape).astype(dtype)
            filter_np = np.random.uniform(size=filter_shape).astype(dtype)
            dilated_out_grad_np = tvm.topi.testing.dilate_python(
                out_grad_np, [1, stride_h, stride_w, 1])
            # padding params in forward propagation
            fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(
                [padding_h, padding_w], (filter_h, filter_w))
            # padding params in backward propagation
            bpad_top = filter_h - 1 - fpad_top
            bpad_bottom = (filter_h - 1 - fpad_bottom) + (stride_h - 1)
            bpad_left = filter_w - 1 - fpad_left
            bpad_right = (filter_w - 1 - fpad_right) + (stride_w - 1)

            padded_out_grad = np.zeros((
                batch,
                dilated_out_grad_np.shape[1] + bpad_top + bpad_bottom,
                dilated_out_grad_np.shape[2] + bpad_left + bpad_right,
                out_channel,
            ))
            padded_out_grad[:,
                            bpad_top:dilated_out_grad_np.shape[1] + bpad_top,
                            bpad_left:dilated_out_grad_np.shape[2] +
                            bpad_left, :, ] = dilated_out_grad_np

            in_grad_np = np.zeros((batch, in_h, in_w, in_channel))
            for b in range(batch):
                for c in range(in_channel):
                    for m in range(channel_multiplier):
                        in_grad_np[b, :, :, c] += signal.convolve2d(
                            padded_out_grad[b, :, :,
                                            c * channel_multiplier + m],
                            filter_np[:, :, c, m],
                            mode="valid",
                        )[0:in_h, 0:in_w]
            return (out_grad_np, filter_np, in_grad_np)
Esempio n. 13
0
def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation,
                               out_dtype):
    """Compute function for Cortex-M7 SIMD implementation of conv2d."""
    assert isinstance(strides, int) or len(strides) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(strides, int):
        stride_h = stride_w = strides
    else:
        stride_h, stride_w = strides

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch_size, in_height, in_width, in_channels = data.shape
    kernel_h, kernel_w, out_channels, _ = kernel.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_down, pad_right, 0]
    padded_data = pad(data, pad_before, pad_after, name="padded_data")

    rc = te.reduce_axis((0, in_channels), name="rc")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch_size, out_height, out_width, out_channels),
        lambda nn, yy, xx, ff: te.sum(
            padded_data[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx
                        * dilation_w, rc].astype(out_dtype) * kernel[
                            ry, rx, ff, rc].astype(out_dtype),
            axis=[ry, rx, rc],
        ),
        name="conv2d",
        tag="conv2d_nhwc",
    )

    ###########################
    # Config Space Definition #
    ###########################
    n, oh, ow, co = (
        cfg.axis(batch_size.value),
        cfg.axis(out_height.value),
        cfg.axis(out_width.value),
        cfg.axis(out_channels.value),
    )
    kh, kw, ci = (
        cfg.reduce_axis(kernel_h.value),
        cfg.reduce_axis(kernel_w.value),
        cfg.reduce_axis(in_channels.value),
    )

    owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2)
    cio, cii = cfg.define_split(
        "tile_ci",
        ci,
        policy="factors",
        num_outputs=2,
        # TODO: check case with in_channels.value % 4 != 0 with AutoTVM
        filter=None if cfg.is_fallback else lambda x: x.size[-1] % 4 == 0,
    )
    coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2)

    cfg.define_reorder(
        "reorder_0_simd",
        [n, oh, owo, owi, coo, coi, kh, kw, cio, cii],
        policy="candidate",
        candidate=[
            [n, oh, kh, kw, owo, coo, cio, owi, coi, cii],
            [n, oh, kh, kw, coo, owo, cio, owi, coi, cii],
            [n, kh, kw, oh, owo, coo, cio, owi, coi, cii],
            [n, kh, kw, oh, coo, owo, cio, owi, coi, cii],
        ],
    )

    cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32])
    cfg.define_knob("unroll_explicit", [0, 1])

    if cfg.is_fallback:
        cfg.fallback_split("tile_ow", [-1, out_width.value])
        cfg.fallback_split("tile_ci", [-1, in_channels.value])
        cfg.fallback_split("tile_co", [-1, out_channels.value])

    return conv
Esempio n. 14
0
    def test_conv2d_nchw(
        self,
        hexagon_session: Session,
        batch,
        in_channel,
        in_size,
        num_filter,
        kernel,
        stride,
        padding,
        dtype,
        ref_data,
        dilation,
        add_bias,
        apply_relu,
    ):
        target_hexagon = tvm.target.hexagon("v68")

        pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
            padding, (kernel, kernel))
        padding_sum = pad_top + pad_left + pad_bottom + pad_right

        a_np, w_np, b_np, c_np = ref_data

        A = te.placeholder(a_np.shape, name="A", dtype=dtype)
        W = te.placeholder(w_np.shape, name="W", dtype=dtype)
        bias = te.placeholder(b_np.shape, name="bias", dtype=dtype)

        if "int" in dtype:
            tol = {"atol": 0, "rtol": 0}
        elif dtype == "float32":
            tol = {"rtol": 1e-4, "atol": 2e-4}
        elif dtype == "float16":
            # A summation in float16 with a single accumulator very
            # quickly runs into large rounding errors.  At some point,
            # this tolerance should be schedule-dependent for to avoid
            # false negatives.
            num_values_summed = in_channel * kernel * kernel
            gap_size = np.nextafter(c_np.max(), np.inf,
                                    dtype=c_np.dtype) - c_np.max()
            tol = {"rtol": 1e-3, "atol": num_values_summed * gap_size / 2}

        with tvm.target.Target(target_hexagon):
            fcompute = topi.nn.conv2d_nchw
            fschedule = topi.hexagon.schedule_conv2d_nchw
            C = fcompute(A, W, (stride, stride), padding, (dilation, dilation),
                         dtype)
            if add_bias:
                C = topi.add(C, bias)
            if apply_relu:
                C = topi.nn.relu(C)
            s = fschedule([C])

        func_name = "conv2d_{}_{}_{}_{}_{}_{}_{}_{}_{}".format(
            dtype,
            batch,
            in_channel,
            in_size,
            num_filter,
            kernel,
            stride,
            padding_sum,
            dilation,
        )
        func = tvm.build(
            s,
            [A, W, bias, C],
            tvm.target.Target(target_hexagon, host=target_hexagon),
            name=func_name,
        )
        mod = hexagon_session.load_module(func)

        dev = hexagon_session.device
        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(b_np, dev)

        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        mod[func_name](a, w, b, c)
        tvm.testing.assert_allclose(c.numpy(), c_np, **tol)
def depthwise_conv2d_with_workload_nchw(
    batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1
):
    in_width = in_height
    filter_channel = in_channel
    filter_width = filter_height
    stride_h = stride_w = stride

    if dilation == 1:
        # here we transform the padding argument from 'str' to  'tuple' ,
        # because we need this to match the "workload" tuple to the records in TopHub
        padt, padl, padb, padr = get_pad_tuple(padding, (filter_height, filter_width))
        padding_args = (padt, padl, padb, padr)
    else:
        padding_args = padding

    # placeholder
    Input = te.placeholder((batch, in_channel, in_height, in_width), name="Input")
    Filter = te.placeholder(
        (filter_channel, channel_multiplier, filter_height, filter_width), name="Filter"
    )
    Scale = te.placeholder((in_channel * channel_multiplier,), name="Scale")
    Shift = te.placeholder((in_channel * channel_multiplier,), name="Shift")

    dtype = "float32"

    def check_target(target, dev):
        print("Running on target: %s" % target)

        impl_list = tvm.topi.testing.dispatch(target, _depthwise_conv2d_nchw_implement)[:]
        if target == "llvm" and channel_multiplier == 1 and dilation == 1:
            impl_list.append(
                (topi.x86.depthwise_conv2d_nchw, topi.x86.schedule_depthwise_conv2d_nchw)
            )

        for fcompute, fschedule in impl_list:
            with tvm.target.Target(target):
                # declare
                DepthwiseConv2d = fcompute(
                    Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype
                )
                ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift)
                Relu = topi.nn.relu(ScaleShift)
                # schedule
                s1 = fschedule(DepthwiseConv2d)
                s2 = fschedule(ScaleShift)
                s3 = fschedule(Relu)
            # build the kernels
            f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], target)
            f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], target)
            f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], target)

            # Prepare pod type for test data closure
            input_shape = get_const_tuple(Input.shape)
            filter_shape = get_const_tuple(Filter.shape)
            scale_shape = get_const_tuple(Scale.shape)
            shift_shape = get_const_tuple(Shift.shape)
            scale_shift_shape = get_const_tuple(ScaleShift.shape)

            # Use memoize, pickle the test data for next time use.
            @memoize("topi.tests.test_topi_depthwise_conv2d.nchw")
            def get_ref_data():
                input_np = np.random.uniform(size=input_shape).astype(dtype)
                filter_np = np.random.uniform(size=filter_shape).astype(dtype)
                dilated_filter_np = tvm.topi.testing.dilate_python(
                    filter_np, (1, 1, dilation, dilation)
                )
                scale_np = np.random.uniform(size=scale_shape).astype(dtype)
                shift_np = np.random.uniform(size=shift_shape).astype(dtype)
                # correctness with scipy
                depthwise_conv2d_scipy = tvm.topi.testing.depthwise_conv2d_python_nchw(
                    input_np, dilated_filter_np, stride, padding
                )
                scale_shift_scipy = np.zeros(shape=scale_shift_shape)
                for c in range(in_channel * channel_multiplier):
                    scale_shift_scipy[:, c, :, :] = (
                        depthwise_conv2d_scipy[:, c, :, :] * scale_np[c] + shift_np[c]
                    )
                    relu_scipy = np.maximum(scale_shift_scipy, 0)
                return (
                    input_np,
                    filter_np,
                    scale_np,
                    shift_np,
                    depthwise_conv2d_scipy,
                    scale_shift_scipy,
                    relu_scipy,
                )

            # Get the test data
            (
                input_np,
                filter_np,
                scale_np,
                shift_np,
                depthwise_conv2d_scipy,
                scale_shift_scipy,
                relu_scipy,
            ) = get_ref_data()

            def verify_workload_padding():
                _, _, out_height, out_width = get_const_tuple(depthwise_conv2d_scipy.shape)
                wkl = _get_workload(
                    Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype
                )

                # check if tile_ow candidates are the factors of the right output weight.
                with tvm.target.Target(target):
                    cfg = autotvm.get_config()
                    _fallback_schedule(cfg, wkl)
                    ow_tile = np.prod(cfg["tile_ow"].size)

                    tvm.testing.assert_allclose(ow_tile, out_width)

            if "llvm" in target:
                verify_workload_padding()

            input_tvm = tvm.nd.array(input_np, dev)
            filter_tvm = tvm.nd.array(filter_np, dev)
            scale_tvm = tvm.nd.array(scale_np, dev)
            shift_tvm = tvm.nd.array(shift_np, dev)
            depthwise_conv2d_tvm = tvm.nd.array(
                np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype),
                dev,
            )
            scale_shift_tvm = tvm.nd.array(
                np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), dev
            )
            relu_tvm = tvm.nd.array(
                np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), dev
            )
            # launch kernel 1 (depthwise_conv2d)
            timer_1 = f1.time_evaluator(f1.entry_name, dev, number=1)
            tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean
            # launch kernel 2 (depthwise_conv2d + scale_shift)
            timer_2 = f2.time_evaluator(f2.entry_name, dev, number=1)
            tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean
            # launch kernel 3 (depthwise_conv2d + scale_shift + relu)
            timer_3 = f3.time_evaluator(f3.entry_name, dev, number=1)
            tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean
            tvm.testing.assert_allclose(
                depthwise_conv2d_tvm.numpy(), depthwise_conv2d_scipy, rtol=1e-5
            )
            tvm.testing.assert_allclose(scale_shift_tvm.numpy(), scale_shift_scipy, rtol=1e-5)
            tvm.testing.assert_allclose(relu_tvm.numpy(), relu_scipy, rtol=1e-5)

    for target, dev in tvm.testing.enabled_targets():
        with autotvm.tophub.context(target):  # load tophub pre-tuned parameters
            check_target(target, dev)
Esempio n. 16
0
    def test_conv2d_nchw(
        self,
        target,
        dev,
        batch,
        in_channel,
        in_size,
        num_filter,
        kernel,
        stride,
        padding,
        dtype,
        ref_data,
        dilation,
        add_bias,
        apply_relu,
    ):
        target = tvm.target.Target(target)
        is_cudnn_target = target.kind.name == "cuda" and "cudnn" in target.attrs.get("libs", [])

        if target.kind.name == "vulkan" and dtype == "float16":
            if not target.attrs.get("supports_float16", False) or not target.attrs.get(
                "supports_16bit_buffer", False
            ):
                pytest.xfail("Vulkan device does not support float16")

        if (
            target.kind.name == "cuda"
            and dtype == "float16"
            and not tvm.contrib.nvcc.have_fp16(dev.compute_version)
        ):
            pytest.xfail("CUDA float16 intrinsics not available")

        pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
        padding_sum = pad_top + pad_left + pad_bottom + pad_right

        has_asymmetric_padding = (pad_top != pad_bottom) or (pad_left != pad_right)
        if is_cudnn_target and has_asymmetric_padding:
            pytest.xfail("CuDNN does not support asymmetric padding")

        a_np, w_np, b_np, c_np = ref_data

        A = te.placeholder(a_np.shape, name="A", dtype=dtype)
        W = te.placeholder(w_np.shape, name="W", dtype=dtype)
        bias = te.placeholder(b_np.shape, name="bias", dtype=dtype)

        if "int" in dtype:
            tol = {"atol": 0, "rtol": 0}
        elif dtype == "float32":
            tol = {"rtol": 1e-4, "atol": 2e-4}
        elif dtype == "float16":
            # A summation in float16 with a single accumulator very
            # quickly runs into large rounding errors.  At some point,
            # this tolerance should be schedule-dependent for to avoid
            # false negatives.
            num_values_summed = in_channel * kernel * kernel
            gap_size = np.nextafter(c_np.max(), np.inf, dtype=c_np.dtype) - c_np.max()
            tol = {"rtol": 1e-3, "atol": num_values_summed * gap_size / 2}

        with autotvm.tophub.context(target):  # load tophub pre-tuned parameters
            if is_cudnn_target:
                fcompute, fschedule = topi.cuda.conv2d_cudnn, topi.cuda.schedule_conv2d_cudnn
            else:
                fcompute, fschedule = tvm.topi.testing.get_conv2d_nchw_implement(target)

            with target:
                if is_cudnn_target:
                    C = fcompute(
                        A, W, (stride, stride), padding, (dilation, dilation), 1, "NCHW", dtype
                    )
                else:
                    C = fcompute(A, W, (stride, stride), padding, (dilation, dilation), dtype)
                if add_bias:
                    C = topi.add(C, bias)
                if apply_relu:
                    C = topi.nn.relu(C)
                s = fschedule([C])

            a = tvm.nd.array(a_np, dev)
            w = tvm.nd.array(w_np, dev)
            b = tvm.nd.array(b_np, dev)

            c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
            func = tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="conv2d_{}_{}_{}_{}_{}_{}_{}_{}_{}".format(
                    dtype,
                    batch,
                    in_channel,
                    in_size,
                    num_filter,
                    kernel,
                    stride,
                    padding_sum,
                    dilation,
                ),
            )
            func(a, w, b, c)
            tvm.testing.assert_allclose(c.numpy(), c_np, **tol)
Esempio n. 17
0
def verify_conv2d_hwnc(
    batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, dtype="int4"
):
    """Test the conv2d with tensorcore for hwnc layout"""
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
    padding_sum = pad_top + pad_left + pad_bottom + pad_right
    print(
        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
        % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)
    )
    # choose dtype from int4, int8
    assert dtype in ["int4", "int8"]

    in_height = in_width = in_size

    A = te.placeholder((in_height, in_width, batch, in_channel), name="A", dtype=dtype)
    W = te.placeholder((kernel, kernel, num_filter, in_channel), name="W", dtype=dtype)

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)

    @memoize("topi.tests.test_topi_conv2d_hwnc.verify_conv2d_hwnc")
    def get_ref_data():
        if dtype == "int4":
            a_np = np.random.randint(low=-8, high=7, size=a_shape).transpose((2, 0, 1, 3))
            w_np = np.random.randint(low=-8, high=7, size=w_shape)
            dw_np = topi.testing.dilate_python(
                w_np.transpose((0, 1, 3, 2)), (1, 1, dilation, dilation)
            )
        elif dtype == "int8":
            a_np = (
                np.random.randint(low=-128, high=127, size=a_shape)
                .transpose((2, 0, 1, 3))
                .astype(dtype)
            )
            w_np = np.random.randint(low=-128, high=127, size=w_shape).astype(dtype)
            dw_np = topi.testing.dilate_python(
                w_np.transpose((0, 1, 3, 2)), (1, 1, dilation, dilation)
            )

        c_np = topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding)
        return a_np, w_np, c_np

    def convert_int32_into_int4(a_int32):
        """convert int32 values into int4
        Parameters
        ----------
        a_int32 : int

        Return
        ------
        a_int4 : int
        """
        I, J, K, L = a_int32.shape
        a_int4 = np.zeros(shape=(I, J, K, L // 8), dtype=np.int32)
        for i in range(I):
            for j in range(J):
                for k in range(K):
                    for l in range(L // 8):
                        for m in range(min(8, L - l * 8)):
                            a_int4[i, j, k, l] = a_int4[i, j, k, l] | (
                                (a_int32[i, j, k, l * 8 + m] & 0xF) << ((7 - m) * 4)
                            )
        return a_int4

    a_np, w_np, c_np = get_ref_data()
    if dtype == "int4":
        a_np = convert_int32_into_int4(a_np)
        w_np = convert_int32_into_int4(w_np)

    def check_target(target):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        if not nvcc.have_tensorcore(dev.compute_version):
            print("skip because gpu does not support Tensor Cores")
            return
        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            fcompute, fschedule = topi.testing.dispatch(target, _conv2d_hwnc_tensorcore_implement)
            C = fcompute(A, W, stride, padding, dilation, dtype, "int32")
            s = fschedule([C])

        a = tvm.nd.array(a_np.transpose((1, 2, 0, 3)), dev)
        w = tvm.nd.array(w_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)

        func = tvm.build(
            s,
            [A, W, C],
            target,
            name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
            % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
        )
        func(a, w, c)

        rtol = 1e-3
        tvm.testing.assert_allclose(c.asnumpy().transpose((2, 0, 1, 3)), c_np, rtol=rtol)

    check_target("cuda")
Esempio n. 18
0
def compile_conv2d_NHWC_gemm_int8_arm(
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation=1,
    add_bias=False,
    add_relu=False,
):
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel, kernel))
    padding_sum = pad_top + pad_left + pad_bottom + pad_right
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum,
           dilation))

    in_height = in_width = in_size
    A = te.placeholder((batch, in_height, in_width, in_channel),
                       name="A",
                       dtype="int8")
    W = te.placeholder((kernel, kernel, in_channel, num_filter),
                       name="W",
                       dtype="int8")
    bias = te.placeholder((num_filter, ), name="bias", dtype="int8")
    dtype = "int32"
    devices = [
        (
            "llvm --device arm_cpu --mtriple aarch64-linux-gnu",
            topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved,
            topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved,
        ),
        (
            "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod",
            topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved,
            topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved,
        ),
        (
            "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod",
            topi.arm_cpu.compute_conv2d_NHWC_quantized_native,
            topi.arm_cpu.schedule_conv2d_NHWC_quantized_native,
        ),
        # TODO(giuseros) Need LLVM-11 in order to compile with +i8mm extension
        # (
        #   "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+i8mm",
        #   topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved,
        #   topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved,
        # ),
    ]

    for device_tuple in devices:
        target = device_tuple[0]
        compute = device_tuple[1]
        schedule = device_tuple[2]

        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        print("Compiling on arm AArch64 target: %s" % target)
        with tvm.target.Target(target):
            assert is_aarch64_arm(), "AArch64 target not recognized"

            C = compute(A, W, (stride, stride), padding, (dilation, dilation),
                        dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = schedule([C])

        if add_bias:
            tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func = tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%dnnn_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
        else:
            func = tvm.build(
                s,
                [A, W, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
def deformable_conv2d_nchw_python(a_np, offset_np, w_np, stride, padding,
                                  dilation, deformable_groups, groups):
    """Deformable convolution operator in NCHW layout.

    Parameters
    ----------
    a_np : numpy.ndarray
        4-D with shape [batch, in_channel, in_height, in_width]

    offset_np : numpy.ndarray
        4-D with shape [batch, deformable_groups * filter_height * filter_width * 2,
                        out_height, out_width]

    w_np : numpy.ndarray
        4-D with shape [num_filter, in_channel, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str or a list/tuple of 2 or 4 ints
        Padding size, or ['VALID', 'SAME'], or
        [pad_height, pad_width] for 2 ints, or
        [pad_top, pad_left, pad_bottom, pad_right] for 2 ints

    dilation : int or a list/tuple of two ints
        Dilation size, or [dilate_height, dilate_width]

    deformable_groups : int
        Number of deformable groups

    groups : int
        Number of groups

    Returns
    -------
    b_np : np.ndarray
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    batch, in_channel, in_height, in_width = a_np.shape
    out_channel, _, kernel_h, kernel_w = w_np.shape
    out_height, out_width = offset_np.shape[-2:]
    dtype = a_np.dtype
    ic_per_dgroup = in_channel // deformable_groups
    assert groups == 1, "deformable_conv2d_nchw_python does not support groups > 1"

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    pad_top, pad_left, _, _ = get_pad_tuple(padding, (kernel_h, kernel_w))

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    def _bilinear(n, c, h, w):
        low_h, low_w = int(h), int(w)
        high_h = min(low_h + 1, in_height - 1)
        high_w = min(low_w + 1, in_width - 1)
        y_lerp = h - low_h
        x_lerp = w - low_w

        bottom = (1 - x_lerp) * a_np[n, c, low_h, low_w] + x_lerp * a_np[
            n, c, low_h, high_w]
        top = (1 - x_lerp) * a_np[n, c, high_h,
                                  low_w] + x_lerp * a_np[n, c, high_h, high_w]
        return (1 - y_lerp) * bottom + y_lerp * top

    a_deform = np.zeros(
        (batch, in_channel, out_height, out_width, kernel_h, kernel_w),
        dtype=dtype)
    for n, h, w in itertools.product(range(batch), range(out_height),
                                     range(out_width)):
        offset = offset_np[n, :, h, w].reshape(deformable_groups, kernel_h,
                                               kernel_w, 2)
        in_h = h * stride_h - pad_top
        in_w = w * stride_w - pad_left

        index_h_base, index_w_base = np.meshgrid(
            np.arange(in_h,
                      in_h + kernel_h * dilation_h,
                      dilation_h,
                      dtype=offset_np.dtype),
            np.arange(in_w,
                      in_w + kernel_w * dilation_w,
                      dilation_w,
                      dtype=offset_np.dtype),
            indexing="ij",
        )

        for c, kh, kw in itertools.product(range(in_channel), range(kernel_h),
                                           range(kernel_w)):
            dg = c // ic_per_dgroup
            index_h = index_h_base + offset[dg, ..., 0]
            index_w = index_w_base + offset[dg, ..., 1]

            y, x = index_h[kh, kw], index_w[kh, kw]
            if y < 0 or y >= in_height or x < 0 or x >= in_width:
                continue
            a_deform[n, c, h, w, kh, kw] = _bilinear(n, c, y, x)

    b_np = np.zeros((batch, out_channel, out_height, out_width), dtype=dtype)
    for n, c, f, h, w in itertools.product(range(batch), range(in_channel),
                                           range(out_channel),
                                           range(out_height),
                                           range(out_width)):
        b_np[n, f, h, w] += np.tensordot(a_deform[n, c, h, w], w_np[f, c])

    return b_np
Esempio n. 20
0
def verify_conv2d_NHWC_gemm_int8(
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation=1,
    add_bias=False,
    add_relu=False,
):
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel, kernel))
    padding_sum = pad_top + pad_left + pad_bottom + pad_right
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum,
           dilation))

    in_height = in_width = in_size

    A = te.placeholder((batch, in_height, in_width, in_channel),
                       name="A",
                       dtype="int8")
    W = te.placeholder((kernel, kernel, in_channel, num_filter),
                       name="W",
                       dtype="int8")
    bias = te.placeholder((num_filter, ), name="bias", dtype="int8")

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.randint(low=-128, high=127,
                                 size=a_shape).astype(dtype)
        w_np = np.random.randint(low=-128, high=128,
                                 size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = tvm.topi.testing.dilate_python(w_np,
                                               (dilation, dilation, 1, 1))
        c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride,
                                                   padding).astype(dtype)

        if add_bias:
            b_np = np.random.uniform(size=bias_shape).astype(dtype)
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)

        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def check_target(target):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            C = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved(
                A, W, (stride, stride), padding, (dilation, dilation), dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved([C])

        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        if add_bias:
            tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func = tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, c)
        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)

    check_target("llvm")
Esempio n. 21
0
def verify_conv2d_NCHWc_int8(
    in_dtype,
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation=1,
    add_bias=False,
    add_relu=False,
):
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel, kernel))
    padding_sum = pad_top + pad_left + pad_bottom + pad_right
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum,
           dilation))

    in_height = in_width = in_size

    A = te.placeholder((batch, in_channel, in_height, in_width),
                       name="A",
                       dtype=in_dtype)
    W = te.placeholder((num_filter, in_channel, kernel, kernel),
                       name="W",
                       dtype=in_dtype)

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    dtype = A.dtype
    out_dtype = "int32" if in_dtype == "int8" else "uint32"
    lo = -128 if in_dtype == "int8" else 0
    hi = 127 if in_dtype == "int8" else 255

    def check_target(target, compute, schedule, oc_block_factor, build_only):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        if target == "cuda" and not tvm.contrib.nvcc.have_int8(
                dev.compute_version):
            print("Skip because int8 intrinsics are not available")
            return

        bias = te.placeholder(
            (num_filter // oc_block_factor, 1, 1, oc_block_factor),
            name="bias",
            dtype=out_dtype)
        bias_shape = get_const_tuple(bias.shape)

        @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw")
        def get_ref_data():
            a_np = np.random.randint(low=lo, high=hi,
                                     size=a_shape).astype(out_dtype)
            w_np = np.random.randint(low=lo, high=hi,
                                     size=w_shape).astype(out_dtype)
            b_np = np.random.uniform(size=bias_shape).astype(out_dtype)
            dw_np = tvm.topi.testing.dilate_python(w_np,
                                                   (1, 1, dilation, dilation))
            c_np = tvm.topi.testing.conv2d_nchw_python(
                a_np, dw_np, stride, padding).astype(out_dtype)

            # convert to NCHWc
            _, _, out_height, out_width = c_np.shape
            c_np = c_np.reshape(
                (batch, num_filter // oc_block_factor, oc_block_factor,
                 out_height, out_width)).transpose(0, 1, 3, 4, 2)

            if add_bias:
                b_np = np.random.uniform(size=bias_shape).astype(out_dtype)
                c_np += b_np
            if add_relu:
                c_np = np.maximum(c_np, 0)

            return a_np, w_np, b_np, c_np

        a_np, w_np, b_np, c_np = get_ref_data()

        with tvm.target.Target(target):
            C = compute(
                A,
                W,
                (stride, stride),
                padding,
                (dilation, dilation),
                "NCHW",
                "NCHW",
                out_dtype,
            )
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = schedule([C])

        a = tvm.nd.array(a_np.astype(dtype), dev)
        w = tvm.nd.array(w_np.astype(dtype), dev)
        b = tvm.nd.array(b_np.astype(out_dtype), dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)

        if add_bias:
            compile_args = [A, W, bias, C]
            run_args = [a, w, b, c]
        else:
            compile_args = [A, W, C]
            run_args = [a, w, c]

        func = tvm.build(
            s,
            compile_args,
            target,
            name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
            (batch, in_channel, in_size, num_filter, kernel, stride,
             padding_sum, dilation),
        )

        if build_only:
            return

        print("Running on target: %s" % target)

        func(*run_args)

        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)

    targets = [
        (
            "cuda",
            lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(
                a, w, s, p, d, l, o),
            topi.cuda.schedule_conv2d_NCHWc_int8,
            4,
            False,
        ),
        # Disable on CI since it does not support spirv int8 dot product
        # (
        #     "vulkan -from_device=0",
        #     lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o),
        #     topi.cuda.schedule_conv2d_NCHWc_int8,
        #     4,
        #     False,
        # ),
    ]

    build_only_aarch64 = platform.machine() != "aarch64"

    targets.append((
        "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon,+v8.2a,+dotprod",
        topi.arm_cpu.conv2d_NCHWc_int8,
        topi.arm_cpu.schedule_conv2d_NCHWc_int8,
        8,
        build_only_aarch64,
    ))

    if in_dtype == "int8":
        targets += [
            (
                "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon",
                topi.arm_cpu.conv2d_NCHWc_int8,
                topi.arm_cpu.schedule_conv2d_NCHWc_int8,
                8,
                build_only_aarch64,
            ),
            (
                "rocm -mattr=+dotprod",
                lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(
                    a, w, s, p, d, l, o),
                topi.cuda.schedule_conv2d_NCHWc_int8,
                4,
                False,
            ),
        ]

    for target, compute, schedule, oc_block_factor, build_only in targets:
        check_target(target, compute, schedule, oc_block_factor, build_only)
def depthwise_conv2d_with_workload_nhwc(batch,
                                        in_channel,
                                        in_height,
                                        channel_multiplier,
                                        filter_height,
                                        stride_h,
                                        padding,
                                        dilation=1):
    in_width = in_height
    filter_channel = in_channel
    filter_width = filter_height
    stride_w = stride_h

    if dilation == 1:
        # here we transform the padding argument from 'str' to  'tuple' ,
        # because we need this to match the "workload" tuple to the records in TopHub
        pad_h, pad_w, _, _ = get_pad_tuple(padding,
                                           (filter_height, filter_width))
        padding_args = (pad_h, pad_w)
    else:
        padding_args = padding

    # placeholder
    Input = te.placeholder((batch, in_height, in_width, in_channel),
                           name="Input")
    Filter = te.placeholder(
        (filter_height, filter_width, filter_channel, channel_multiplier),
        name="Filter")
    Scale = te.placeholder((in_channel * channel_multiplier, ), name="Scale")
    Shift = te.placeholder((in_channel * channel_multiplier, ), name="Shift")

    dtype = "float32"

    def check_device(device, ctx):
        print("Running on target: %s" % device)

        fcompute, fschedule = tvm.topi.testing.dispatch(
            device, _depthwise_conv2d_nhwc_implement)
        with tvm.target.Target(device):
            # declare
            DepthwiseConv2d = fcompute(Input, Filter, (stride_h, stride_w),
                                       padding_args, dilation, dtype)
            ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale,
                                                  Shift)
            Relu = topi.nn.relu(ScaleShift)
            # schedule
            s1 = fschedule(DepthwiseConv2d)
            s2 = fschedule(ScaleShift)
            s3 = fschedule(Relu)
        # build the kernels
        f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
        f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
        f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)

        # Prepare pod type for test data closure
        input_shape = get_const_tuple(Input.shape)
        filter_shape = get_const_tuple(Filter.shape)
        scale_shape = get_const_tuple(Scale.shape)
        shift_shape = get_const_tuple(Shift.shape)
        scale_shift_shape = get_const_tuple(ScaleShift.shape)

        # Use memoize, pickle the test data for next time use.
        @memoize("topi.tests.test_topi_depthwise_conv2d.nhwc.v2")
        def get_ref_data():
            input_np = np.random.uniform(size=input_shape).astype(dtype)
            filter_np = np.random.uniform(size=filter_shape).astype(dtype)
            dilated_filter_np = tvm.topi.testing.dilate_python(
                filter_np, (dilation, dilation, 1, 1))
            scale_np = np.random.uniform(size=scale_shape).astype(dtype)
            shift_np = np.random.uniform(size=shift_shape).astype(dtype)
            # correctness with scipy
            depthwise_conv2d_scipy = tvm.topi.testing.depthwise_conv2d_python_nhwc(
                input_np,
                dilated_filter_np,
                stride=[stride_h, stride_w],
                padding=padding)
            scale_shift_scipy = np.zeros(shape=scale_shift_shape)
            for c in range(in_channel * channel_multiplier):
                scale_shift_scipy[:, :, :, c] = (
                    depthwise_conv2d_scipy[:, :, :, c] * scale_np[c] +
                    shift_np[c])
                relu_scipy = np.maximum(scale_shift_scipy, 0)
            return (
                input_np,
                filter_np,
                scale_np,
                shift_np,
                depthwise_conv2d_scipy,
                scale_shift_scipy,
                relu_scipy,
            )

        # Get the test data
        (
            input_np,
            filter_np,
            scale_np,
            shift_np,
            depthwise_conv2d_scipy,
            scale_shift_scipy,
            relu_scipy,
        ) = get_ref_data()

        # prepare data
        input_tvm = tvm.nd.array(input_np, ctx)
        filter_tvm = tvm.nd.array(filter_np, ctx)
        scale_tvm = tvm.nd.array(scale_np, ctx)
        shift_tvm = tvm.nd.array(shift_np, ctx)
        depthwise_conv2d_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
                     dtype=DepthwiseConv2d.dtype), ctx)
        scale_shift_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(ScaleShift.shape),
                     dtype=ScaleShift.dtype), ctx)
        relu_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
        # launch kernel 1 (depthwise_conv2d)
        timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1)
        tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean
        # launch kernel 2 (depthwise_conv2d + scale_shift)
        timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1)
        tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm,
                          scale_shift_tvm).mean
        # launch kernel 3 (depthwise_conv2d + scale_shift + relu)
        timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1)
        tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm,
                          relu_tvm).mean
        relu_scipy = np.maximum(scale_shift_scipy, 0)
        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(),
                                    depthwise_conv2d_scipy,
                                    rtol=1e-5)
        tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(),
                                    scale_shift_scipy,
                                    rtol=1e-5)
        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)

    for device, ctx in tvm.testing.enabled_targets():
        with autotvm.tophub.context(
                device):  # load tophub pre-tuned parameters
            check_device(device, ctx)
Esempio n. 23
0
def verify_conv2d_NCHWc_int8(
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation=1,
    add_bias=False,
    add_relu=False,
):
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel, kernel))
    padding_sum = pad_top + pad_left + pad_bottom + pad_right
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum,
           dilation))

    in_height = in_width = in_size

    A = te.placeholder((batch, in_channel, in_height, in_width),
                       name="A",
                       dtype="int8")
    W = te.placeholder((num_filter, in_channel, kernel, kernel),
                       name="W",
                       dtype="int8")
    bias = te.placeholder(
        (num_filter // oc_block_factor, 1, 1, oc_block_factor),
        name="bias",
        dtype="int8")

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.randint(low=-128, high=127,
                                 size=a_shape).astype(dtype)
        w_np = np.random.randint(low=-128, high=128,
                                 size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = tvm.topi.testing.dilate_python(w_np,
                                               (1, 1, dilation, dilation))
        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride,
                                                   padding).astype(dtype)

        # convert to NCHWc
        _, _, out_height, out_width = c_np.shape
        c_np = c_np.reshape(
            (batch, num_filter // oc_block_factor, oc_block_factor, out_height,
             out_width)).transpose(0, 1, 3, 4, 2)

        if add_bias:
            b_np = np.random.uniform(size=bias_shape).astype(dtype)
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)

        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def check_target(target):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        if target == "cuda" and not tvm.contrib.nvcc.have_int8(
                dev.compute_version):
            print("Skip because int8 intrinsics are not available")
            return

        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            C = topi.cuda.conv2d_NCHWc_int8(A, W, (stride, stride), padding,
                                            (dilation, dilation), "NCHW",
                                            dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.cuda.schedule_conv2d_NCHWc_int8([C])

        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        if add_bias:
            tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func = tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, c)
        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)

    for target in ["cuda"]:
        check_target(target)
Esempio n. 24
0
def verify_conv2d_nchw(
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation=1,
    add_bias=False,
    add_relu=False,
    use_cudnn=False,
):

    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel, kernel))
    padding_sum = pad_top + pad_left + pad_bottom + pad_right
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum,
           dilation))

    in_height = in_width = in_size

    A = te.placeholder((batch, in_channel, in_height, in_width), name="A")
    W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W")
    bias = te.placeholder((num_filter, 1, 1), name="bias")

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = tvm.topi.testing.dilate_python(w_np,
                                               (1, 1, dilation, dilation))
        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride,
                                                   padding)
        if add_bias:
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)
        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def verify_workload_padding():
        _, _, out_height, out_width = get_const_tuple(c_np.shape)
        wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype)

        # check if tile_ow candidates are the factors of the right output weight.
        cfg = autotvm.get_config()
        _fallback_schedule(cfg, wkl)
        ow_tile = np.prod(cfg["tile_ow"].size)

        tvm.testing.assert_allclose(ow_tile, out_width)

    def check_target(target):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        print("Running on target: %s" % target)

        if "cudnn" in target:
            fcompute, fschedule = topi.cuda.conv2d_cudnn, topi.cuda.schedule_conv2d_cudnn
        else:
            fcompute, fschedule = tvm.topi.testing.get_conv2d_nchw_implement(
                target)

        with tvm.target.Target(target):
            if "cudnn" in target:
                C = fcompute(A, W, (stride, stride), padding,
                             (dilation, dilation), 1, "NCHW", dtype)
            else:
                C = fcompute(A, W, (stride, stride), padding,
                             (dilation, dilation), dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = fschedule([C])

            if "llvm" in target:
                verify_workload_padding()

        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(b_np, dev)

        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        if add_bias:
            func = tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, c)
        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-4)

    for target, dev in tvm.testing.enabled_targets():
        with autotvm.tophub.context(
                target):  # load tophub pre-tuned parameters
            check_target(target)

    if use_cudnn:
        check_target("cuda -model=unknown -libs=cudnn")
def _conv2d_transpose_nchw_python(a_np, w_np, stride, padding, output_padding):
    """Transposed convolution operator in NCHW layout.

    Parameters
    ----------
    a_np : numpy.ndarray
        4-D with shape [batch, in_channel, in_height, in_width]

    w_np : numpy.ndarray
        4-D with shape [in_channel, num_filter, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    output_padding : int or a list/tuple of two ints
        Use to disambiguate the output shape.

    Returns
    -------
    b_np : np.ndarray
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    batch, in_c, in_h, in_w = a_np.shape
    _, out_c, filter_h, filter_w = w_np.shape
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride
    if isinstance(output_padding, int):
        opad_h = opad_w = output_padding
    else:
        opad_h, opad_w = output_padding
    assert opad_h < stride_h and opad_w < stride_w
    # dilate stage
    dilated_a_np = tvm.topi.testing.dilate_python(a_np,
                                                  [1, 1, stride_h, stride_w])
    # padding stage
    fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(
        padding, (filter_h, filter_w))
    bpad_top = filter_h - 1 - fpad_top
    bpad_bottom = filter_h - 1 - fpad_bottom + opad_h
    bpad_left = filter_w - 1 - fpad_left
    bpad_right = filter_w - 1 - fpad_right + opad_w
    padded_a_np = np.zeros((
        batch,
        in_c,
        dilated_a_np.shape[2] + bpad_top + bpad_bottom,
        dilated_a_np.shape[3] + bpad_left + bpad_right,
    ))
    padded_a_np[:, :, bpad_top:dilated_a_np.shape[2] + bpad_top,
                bpad_left:dilated_a_np.shape[3] + bpad_left, ] = dilated_a_np
    # convolution stage
    out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h + opad_h
    out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w + opad_w
    b_np = np.zeros((batch, out_c, out_h, out_w))
    for n in range(batch):
        for f in range(out_c):
            for c in range(in_c):
                out = scipy.signal.convolve2d(padded_a_np[n, c],
                                              w_np[c, f],
                                              mode="valid")
                b_np[n, f] += out
    return b_np
Esempio n. 26
0
def verify_conv2d_nchw_int8(
    in_dtype,
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation=1,
    add_bias=False,
    add_relu=False,
):
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel, kernel))
    padding_sum = pad_top + pad_left + pad_bottom + pad_right
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum,
           dilation))

    in_height = in_width = in_size

    A = te.placeholder((batch, in_channel, in_height, in_width),
                       name="A",
                       dtype=in_dtype)
    W = te.placeholder((num_filter, in_channel, kernel, kernel),
                       name="W",
                       dtype=in_dtype)
    bias = te.placeholder((num_filter, 1, 1), name="bias", dtype=in_dtype)

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.randint(low=-128, high=127,
                                 size=a_shape).astype(dtype)
        w_np = np.random.randint(low=-128, high=128,
                                 size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = tvm.topi.testing.dilate_python(w_np,
                                               (1, 1, dilation, dilation))
        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride,
                                                   padding).astype(dtype)

        if add_bias:
            b_np = np.random.uniform(size=bias_shape).astype(dtype)
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)

        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def verify_workload_padding():
        _, _, out_height, out_width = get_const_tuple(c_np.shape)
        wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype)

        # for testing functionality,
        # we choose arbitrary int32_lanes and num_int8_elements can divide the channel,
        # regardless of the performance.
        int32_lanes, num_int8_elements = num_filter, in_channel

        # check if tile_ow candidates are the factors of the right output weight.
        cfg = autotvm.get_config()
        fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes,
                                          num_int8_elements)
        ow_tile = np.prod(cfg["tile_ow"].size)

        tvm.testing.assert_allclose(ow_tile, out_width)

    def check_target(target):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        if target == "cuda" and not tvm.contrib.nvcc.have_int8(
                dev.compute_version):
            print("Skip because int8 intrinsics are not available")
            return

        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            C = topi.cuda.conv2d_nchw_int8(A, W, (stride, stride), padding,
                                           (dilation, dilation), dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.cuda.schedule_conv2d_nchw_int8([C])

        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        if add_bias:
            tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func = tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, c)
        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)

    verify_workload_padding()

    for target in ["cuda"]:
        check_target(target)
Esempio n. 27
0
def conv2d_grad(orig, grad):
    """Gradient of conv2d"""
    attrs = orig.attrs
    data, weight = orig.args
    data_shape = get_const_tuple(data.checked_type.shape)
    weight_shape = get_const_tuple(weight.checked_type.shape)
    _, _, grad_h, grad_w = get_const_tuple(orig.checked_type.shape)
    batch, in_channel, in_h, in_w = data_shape
    out_channel, _, filter_h, filter_w = weight_shape

    # infer output_padding
    fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(
        get_const_tuple(attrs.padding), (filter_h, filter_w))
    stride_h, stride_w = get_const_tuple(attrs.strides)
    dilation_h, dilation_w = get_const_tuple(attrs.dilation)
    out_h = (grad_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h
    out_w = (grad_w - 1) * stride_w - fpad_left - fpad_right + filter_w
    output_padding = (in_h - out_h, in_w - out_w)

    assert attrs.data_layout == "NCHW", "only support NCHW data layout"
    assert attrs.kernel_layout == "OIHW", "only support OIHW kernel layout"
    assert attrs.out_layout in ["", "NCHW"], "only support NCHW output layout"

    backward_data = _nn.conv2d_transpose(
        grad,
        weight,
        strides=attrs.strides,
        padding=attrs.padding,
        dilation=attrs.dilation,
        groups=attrs.groups,
        output_padding=output_padding,
    )
    grad = tile(grad, [1, in_channel // attrs.groups, 1, 1])
    grad = reshape(grad, [-1, 1, 0, 0])  # batch * oc * ic // groups, 1, oh, ow
    data = reshape(data, [1, -1, 0, 0])  # 1, batch * ic, ih, iw

    backward_weight = _nn.conv2d(
        data,
        grad,
        strides=attrs.dilation,
        padding=attrs.padding,
        dilation=attrs.strides,
        groups=in_channel * batch,
    )
    # infer shape of backward_weight
    padded_weight_grad_h = (in_h - (grad_h - 1) * stride_h - 1 + fpad_top +
                            fpad_bottom) // dilation_h + 1
    padded_weight_grad_w = (in_w - (grad_w - 1) * stride_w - 1 + fpad_left +
                            fpad_right) // dilation_w + 1
    backward_weight = reshape(
        backward_weight,
        [
            batch,
            in_channel // attrs.groups,
            out_channel,
            padded_weight_grad_h,
            padded_weight_grad_w,
        ],
    )
    backward_weight = _sum(backward_weight, axis=0)
    backward_weight = transpose(backward_weight, [1, 0, 2, 3])

    assert padded_weight_grad_h >= filter_h
    assert padded_weight_grad_w >= filter_w
    if padded_weight_grad_h > filter_h or padded_weight_grad_w > filter_w:
        backward_weight = strided_slice(
            backward_weight,
            begin=[0, 0, 0, 0],
            end=[out_channel, in_channel // attrs.groups, filter_h, filter_w],
        )

    return [backward_data, backward_weight]
def verify_conv2d_nchw(
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation=1,
    add_bias=False,
    add_relu=False,
    devices=["cuda", "llvm -device=arm_cpu", "opencl -device=mali"],
):
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel, kernel))
    padding_sum = pad_top + pad_left + pad_bottom + pad_right
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum,
           dilation))

    in_height = in_width = in_size

    A = te.placeholder((batch, in_channel, in_height, in_width), name="A")
    W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W")
    bias = te.placeholder((num_filter, 1, 1), name="bias")

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = tvm.topi.testing.dilate_python(w_np,
                                               (1, 1, dilation, dilation))
        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride,
                                                   padding)
        if add_bias:
            b_np = np.random.uniform(size=bias_shape).astype(dtype)
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)
        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.Target(device):
            fcompute, fschedule = tvm.topi.testing.dispatch(
                device, _conv2d_nchw_winograd_implement)
            C = fcompute(A, W, stride, padding, dilation, dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = fschedule([C])

        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         ctx)
        if add_bias:
            func = tvm.build(
                s,
                [A, W, bias, C],
                device,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                device,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, c)

        rtol = 1e-3
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=rtol)

    for device in devices:
        check_device(device)
def verify_conv2d_NCHWc(
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation=1,
    add_bias=False,
    add_relu=False,
    dtype="float32",
):
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel, kernel))
    padding_sum = pad_top + pad_left + pad_bottom + pad_right
    in_height = in_width = in_size
    print(
        "Workload: (%d, %d, %d, %d, %d, %d, %d)" %
        (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum))

    # for testing functionality,
    # we choose arbitrary block size that can divide the channel,
    # regardless of the performance.
    oc_block = 1
    for bn in range(16, 0, -1):
        if num_filter % bn == 0:
            oc_block = bn
            break

    ic_block = 1
    for bn in range(oc_block, 0, -1):
        if in_channel % bn == 0:
            ic_block = bn
            break

    A = te.placeholder(
        (batch, in_channel // ic_block, in_height, in_width, ic_block),
        name="A")
    W = te.placeholder(
        (num_filter // oc_block, in_channel // ic_block, kernel, kernel,
         ic_block, oc_block),
        name="W",
    )
    bias = te.placeholder((num_filter // oc_block, 1, 1, oc_block),
                          name="bias")

    @memoize("topi.tests.test_topi_conv2d_NCHWc.verify_conv2d_NCHWc")
    def get_ref_data():
        a_np = np.random.uniform(size=(batch, in_channel, in_height,
                                       in_width)).astype(dtype)
        w_np = np.random.uniform(size=(num_filter, in_channel, kernel,
                                       kernel)).astype(dtype)
        b_np = np.random.uniform(size=(num_filter, 1, 1)).astype(dtype)
        dw_np = tvm.topi.testing.dilate_python(w_np,
                                               (1, 1, dilation, dilation))
        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride,
                                                   padding)
        if add_bias:
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)
        return (
            _transform_data(a_np, ic_block),
            _transform_kernel(w_np, ic_block, oc_block),
            _transform_bias(b_np, oc_block),
            _transform_data(c_np, oc_block),
        )

    a_np, w_np, b_np, c_np = get_ref_data()

    def check_device(device):
        dev = tvm.device(device, 0)
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.Target(device):
            C = topi.x86.conv2d_NCHWc(
                A,
                W,
                (stride, stride),
                padding,
                (dilation, dilation),
                "NCHW%dc" % ic_block,
                "NCHW%dc" % oc_block,
                dtype,
            )
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.x86.schedule_conv2d_NCHWc([C])

        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        if add_bias:
            func = tvm.build(
                s,
                [A, W, bias, C],
                device,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                device,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, c)
        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3)

    # test llvm only for now since conv2d_NCHWc implement is missing in other backend.
    for device in ["llvm"]:
        with autotvm.tophub.context(
                device):  # load tophub pre-tuned parameters
            check_device(device)
def depthwise_conv2d_with_workload_NCHWc(batch,
                                         in_channel,
                                         in_height,
                                         channel_multiplier,
                                         filter_height,
                                         stride,
                                         padding,
                                         dilation=1):
    in_width = in_height
    filter_channel = in_channel
    filter_width = filter_height
    stride_h = stride_w = stride

    assert (
        channel_multiplier == 1
    ), "depthwise_conv2d_NCHWc currently does not support channel multiplier > 1."
    pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width))
    padding_args = (pad_h, pad_w)

    out_channel = filter_channel * channel_multiplier
    # for testing functionality,
    # we choose arbitrary block size that can divide the channel,
    # regardless of the performance.
    oc_block = 1
    for bn in range(16, 0, -1):
        if out_channel % bn == 0:
            oc_block = bn
            break

    ic_block = 1
    for bn in range(oc_block, 0, -1):
        if in_channel % bn == 0:
            ic_block = bn
            break

    # placeholder
    Input = te.placeholder(
        (batch, in_channel // ic_block, in_height, in_width, ic_block),
        name="Input")
    Filter = te.placeholder(
        (out_channel // oc_block, 1, filter_height, filter_width, 1, oc_block),
        name="Filter")
    in_layout = "NCHW%dc" % ic_block
    out_layout = "NCHW%dc" % oc_block
    dtype = "float32"

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.Target(device):
            # declare
            DepthwiseConv2d = topi.x86.depthwise_conv2d_NCHWc(
                Input,
                Filter,
                (stride_h, stride_w),
                padding,
                (dilation, dilation),
                in_layout,
                out_layout,
                dtype,
            )
            # TODO: add scale_shift implement for NCHWc and add test here
            Relu = topi.nn.relu(DepthwiseConv2d)
            # schedule
            s1 = topi.x86.schedule_depthwise_conv2d_NCHWc(DepthwiseConv2d)
            s2 = topi.x86.schedule_depthwise_conv2d_NCHWc(Relu)
        # build the kernels
        f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
        f2 = tvm.build(s2, [Input, Filter, Relu], device)

        # Prepare pod type for test data closure
        input_shape = (batch, in_channel, in_height, in_width)
        filter_shape = (filter_channel, channel_multiplier, filter_height,
                        filter_width)

        # Use memoize, pickle the test data for next time use.
        @memoize("topi.tests.test_topi_depthwise_conv2d.NCHWc")
        def get_ref_data():
            input_np = np.random.uniform(size=input_shape).astype(dtype)
            filter_np = np.random.uniform(size=filter_shape).astype(dtype)
            # correctness with scipy
            dw_np = tvm.topi.testing.dilate_python(
                filter_np, (1, 1, dilation, dilation)).astype(dtype)
            depthwise_conv2d_scipy = tvm.topi.testing.depthwise_conv2d_python_nchw(
                input_np, dw_np, stride, padding)
            relu_scipy = np.maximum(depthwise_conv2d_scipy, 0)
            return (
                _transform_data(input_np, ic_block),
                _transform_kernel(filter_np, oc_block),
                _transform_data(depthwise_conv2d_scipy, oc_block),
                _transform_data(relu_scipy, oc_block),
            )

        # Get the test data
        (input_np, filter_np, depthwise_conv2d_scipy,
         relu_scipy) = get_ref_data()

        input_tvm = tvm.nd.array(input_np, ctx)
        filter_tvm = tvm.nd.array(filter_np, ctx)

        depthwise_conv2d_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
                     dtype=DepthwiseConv2d.dtype), ctx)
        relu_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
        # launch kernel 1 (depthwise_conv2d)
        f1(input_tvm, filter_tvm, depthwise_conv2d_tvm)
        # launch kernel 2 (depthwise_conv2d + relu)
        f2(input_tvm, filter_tvm, relu_tvm)
        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(),
                                    depthwise_conv2d_scipy,
                                    rtol=1e-5)
        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)

    # test llvm only for now since depthwise_conv2d_NCHWc implement is missing in other backend.
    for device in ["llvm"]:
        with autotvm.tophub.context(
                device):  # load tophub pre-tuned parameters
            check_device(device)