Example #1
def group_conv3d_nchw(Input,
                      Filter,
                      stride,
                      padding,
                      dilation,
                      groups,
                      out_dtype=None):
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 3
    assert isinstance(dilation, int) or len(dilation) == 3
    if isinstance(stride, int):
        stride_z = stride_h = stride_w = stride
    else:
        stride_z, stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_z = dilation_h = dilation_w = dilation
    else:
        dilation_z, dilation_h, dilation_w = dilation

    batch, in_channel, in_z, in_height, in_width = get_const_tuple(Input.shape)
    num_filter, _, kernel_z, kernel_h, kernel_w = get_const_tuple(Filter.shape)

    assert in_channel % groups == 0, "input channels must divide group size"
    assert num_filter % groups == 0, "output channels must divide group size"

    pad_front, pad_top, pad_left, pad_back, pad_down, pad_right = get_pad_tuple3d(
        padding, (kernel_z, kernel_h, kernel_w))

    # compute the output shape
    out_channel = num_filter
    out_z = simplify(
        (in_z -
         (kernel_z - 1) * dilation_z - 1 + pad_front + pad_back) // stride_z +
        1)
    out_height = simplify(
        (in_height -
         (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width -
         (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w +
        1)
    # compute graph
    pad_before = [0, 0, pad_front, pad_top, pad_left]
    pad_after = [0, 0, pad_back, pad_down, pad_right]
    temp = pad(Input, pad_before, pad_after, name="pad_temp")
    rc = tvm.reduce_axis((0, in_channel // groups), name='rc')
    rz = tvm.reduce_axis((0, kernel_z), name='rz')
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')
    return tvm.compute(
        (batch, out_channel, out_z, out_height, out_width),
        lambda nn, ff, zz, yy, xx: tvm.sum(temp[
            nn, ff // (num_filter // groups) *
            (in_channel // groups) + rc, zz * stride_z + rz * dilation_z, yy *
            stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w
        ].astype(out_dtype) * Filter[ff, rc, rz, ry, rx].astype(out_dtype),
                                           axis=[rc, rz, ry, rx]),
        tag='group_conv3d_nchw')
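The declaration above only builds a compute graph; below is a minimal usage sketch (not part of the original source). It assumes the helpers used above (get_const_tuple, get_pad_tuple3d, pad, simplify) come from topi, and that the pre-0.7 TVM API (tvm.placeholder, tvm.create_schedule, tvm.build) is in use, matching the tvm.compute / tvm.reduce_axis calls in the function.

# Hypothetical usage: NCDHW input, 4 groups, 3x3x3 kernel
import tvm

data = tvm.placeholder((1, 32, 8, 56, 56), name="data")      # [batch, in_channel, D, H, W]
weight = tvm.placeholder((64, 8, 3, 3, 3), name="weight")    # [num_filter, in_channel // groups, kD, kH, kW]
out = group_conv3d_nchw(data, weight, stride=1, padding=1, dilation=1, groups=4)
s = tvm.create_schedule(out.op)
mod = tvm.build(s, [data, weight, out], target="llvm")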
Example #2
def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
    print("Run in pure nChwc common decl")
    assert layout == 'NCHW', "only support NCHW convolution for AVX"
    wkl = get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)

    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
    num_filter, _, kernel_height, kernel_width, _, co = get_const_tuple(
        kernel.shape)
    num_filter *= co

    pad_height = in_height + 2 * HPAD
    pad_width = in_width + 2 * WPAD

    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1

    # pack data
    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width,
             sch.ic_bn)
    data_vec = tvm.compute(
        shape,
        lambda n, C, h, w, c: data_pad[n, C * sch.ic_bn + c, h, w],
        name='data_vec')

    kernel_vec = kernel

    # convolution
    oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width,
              sch.oc_bn)
    unpack_shape = (batch_size, num_filter, out_height, out_width)

    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')

    conv = tvm.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(
            data_vec[n, ic // sch.ic_bn, oh * HSTR + kh, ow * WSTR + kw, ic %
                     sch.ic_bn] * kernel_vec[oc_chunk, ic // sch.ic_bn, kh, kw,
                                             ic % sch.ic_bn, oc_block],
            axis=[ic, kh, kw]),
        name='conv')

    unpack = tvm.compute(
        unpack_shape,
        lambda n, c, h, w: conv[n, c // sch.oc_bn, h, w, c % sch.oc_bn],
        name='output_unpack',
        tag='conv2d_nchw')
    return unpack
Example #3
def _declaration_conv(wkl, data, kernel):
    sch = _get_schedule(wkl)

    out_dtype = wkl.out_dtype
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    batch_size = data.shape[0]
    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1

    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
    else:
        data_pad = data

    oshape = (batch_size, wkl.out_filter // sch.oc_bn, out_height, out_width,
              sch.oc_bn)
    ic = tvm.reduce_axis((0, wkl.in_filter), name='ic')
    conv = tvm.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_pad[
            n, ic // sch.ic_bn, oh * HSTR, ow * WSTR, ic % sch.ic_bn].astype(
                out_dtype) * kernel[oc_chunk, ic // sch.ic_bn, ic % sch.ic_bn,
                                    oc_block, 0, 0],
                                                      axis=[ic]),
        name='conv2d_NCHWc',
        tag='conv2d_NCHWc')

    return conv
Example #4
def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
    assert layout == 'NCHW', "only support NCHW convolution on rasp"
    assert data.shape[
        0].value == 1, "only support batch size=1 convolution on rasp"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)

    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
    num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)

    pad_height = in_height + 2 * HPAD
    pad_width = in_width + 2 * WPAD

    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1

    # input: c, h, w
    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data
    shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width,
             sch.ic_bn)
    data_vec = tvm.compute(
        shape, lambda n, C, h, w, c: data_pad[n, C * sch.ic_bn + c, h, w])

    shape = (num_filter // sch.oc_bn, in_channel // sch.ic_bn, sch.ic_bn,
             sch.oc_bn, 1, 1)
    kernel_pack = tvm.compute(
        shape, lambda CO, CI, ci, co, h, w: kernel[CO * sch.oc_bn + co, CI *
                                                   sch.ic_bn + ci, h, w])

    oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width,
              sch.oc_bn)
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    conv = tvm.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_vec[
            n, ic // sch.ic_bn, oh * HSTR, ow * WSTR, ic % sch.ic_bn].astype(
                out_dtype) * kernel_pack[oc_chunk, ic // sch.ic_bn, ic % sch.
                                         ic_bn, oc_block, 0, 0],
                                                      axis=[ic]),
        name='conv')

    oshape = (batch_size, num_filter, out_height, out_width)
    unpack = tvm.compute(
        oshape,
        lambda n, oc, oh, ow: conv[n, oc // sch.oc_bn, oh, ow, oc % sch.oc_bn],
        tag='conv2d_nchw')
    return unpack
Example #5
def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
    assert layout == 'NCHWc', "only support NCHWc convolution on rasp"
    assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp"
    wkl = get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)

    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    batch_size, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(data.shape)
    num_filter, _, _, co, kernel_height, kernel_width = get_const_tuple(kernel.shape)
    num_filter *= co

    pad_height = in_height + 2 * HPAD
    pad_width = in_width + 2 * WPAD

    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1

    # input: c, h, w
    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
    else:
        data_pad = data

    in_channel = in_channel_block * in_channel_chunk
    if in_channel_block != sch.ic_bn:
        print('WARNING!!! (1x1) in_channel_block=%d vs sch.ic_bn=%d' % (in_channel_block, sch.ic_bn))
        shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width, sch.ic_bn)
        data_vec = tvm.compute(shape, lambda n, C, h, w, c:
            data_pad[n, (C * sch.ic_bn + c) // in_channel_block, h, w, (C * sch.ic_bn + c) % in_channel_block],
                               tag='conv2d_data_pack')
    else:
        data_vec = data_pad

    kernel_pack = kernel

    oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width, sch.oc_bn)
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
        tvm.sum(data_vec[n, ic // sch.ic_bn, oh * HSTR, ow * WSTR, ic % sch.ic_bn].astype(out_dtype) *
                kernel_pack[oc_chunk, ic // sch.ic_bn, ic % sch.ic_bn, oc_block, 0, 0],
                axis=[ic]), name='conv2d_nChwc', tag='conv2d_nChwc')

    return conv
Example #6
def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'):
    """Convolution operator in NHWC layout.
    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]
    Filter : tvm.Tensor
        4-D with shape [filter_height, filter_width, in_channel, num_filter]
    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]
    padding : int or str
        Padding size, or ['VALID', 'SAME']
    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, out_height,  out_width, out_channel]
    """
    assert isinstance(stride, int) or len(stride) == 2
    batch, in_height, in_width, in_channel = Input.shape
    kernel_h, kernel_w, channel, num_filter = Filter.shape
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    # compute the output shape
    out_channel = num_filter
    pad_before = [0, 0, 0, 0]
    pad_after = [0, 0, 0, 0]
    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
    _, out_height, out_width, _ = PaddedInput.shape
    rc = tvm.reduce_axis((0, in_channel), name='rc')
    Output = tvm.compute(
        (batch, out_height, out_width, out_channel),
        lambda nn, yy, xx, ff: tvm.sum(PaddedInput[nn, yy, xx, rc].astype(
            out_dtype) * Filter[0, 0, rc, ff].astype(out_dtype),
                                       axis=[rc]),
        name="Conv2dOutput",
        tag="conv2d_nhwc")
    return Output
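As written, the compute body reads only Filter[0, 0, rc, ff], applies zero padding, and never uses stride_h / stride_w, so despite the general docstring it effectively covers the 1x1, stride-1, unpadded case. A hypothetical call under those assumptions (pre-0.7 TVM API):

data = tvm.placeholder((1, 56, 56, 64), name="data")        # NHWC
weight = tvm.placeholder((1, 1, 64, 128), name="weight")    # HWIO, 1x1 kernel
out = conv2d_nhwc(data, weight, stride=1, padding=0)        # stride/padding are accepted but unused here
s = tvm.create_schedule(out.op)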
Example #7
def conv2d(N, H, W, CI, CO, KH, KW, strides, padding, scaling_factor):
    dilation = 2
    cfg = autotvm.get_config()

    data = tvm.placeholder((N, CI // BI, H, W, BI), name='data', dtype='int8')
    kernel = tvm.placeholder((CO // BO, CI // BI, KH, KW, BO, BI),
                             name='kernel',
                             dtype='int8')

    pad_h, pad_w = (padding, padding) if isinstance(padding, int) else padding
    stride_h, stride_w = (strides,
                          strides) if isinstance(strides, int) else strides

    pad_height = H + 2 * pad_h
    pad_width = W + 2 * pad_w

    out_height = (pad_height - ((KH - 1) * dilation + 1)) // stride_h + 1
    out_width = (pad_width - ((KW - 1) * dilation + 1)) // stride_w + 1

    DOPAD = (pad_h != 0 or pad_w != 0)
    if DOPAD:
        pad_data = pad(data, (0, 0, pad_h, pad_w, 0), name='pad_data')
    else:
        pad_data = data

    oshape = (N, CO // BO, out_height, out_width, BO)

    ic_chunk = tvm.reduce_axis((0, CI // BI), name='ic_chunk')
    ic_block = tvm.reduce_axis((0, BI), name='ic_block')
    kh = tvm.reduce_axis((0, KH), name='kh')
    kw = tvm.reduce_axis((0, KW), name='kw')

    conv = tvm.compute(
        oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.
        sum(pad_data[n, ic_chunk, oh * stride_h + kh * dilation, ow * stride_w
                     + kw * dilation, ic_block].astype('int32') *
            kernel[oc_chunk, ic_chunk, kh, kw, oc_block, ic_block].astype(
                'int32'),
            axis=[ic_chunk, kh, kw, ic_block]))

    output = tvm.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block:
        (conv[n, oc_chunk, oh, ow, oc_block] * scaling_factor).astype('int8'),
        name='conv')

    s = tvm.create_schedule([output.op])
    s[conv].set_scope('local')

    # inline padding
    if DOPAD:
        s[pad_data].compute_inline()

    data, raw_data = pad_data, data

    # create cache stage
    AA = s.cache_read(data, 'shared', [conv])
    WW = s.cache_read(kernel, 'shared', [conv])

    # tile and bind spatial axes
    n, f, y, x, c = s[output].op.axis
    cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)

    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)

    # this is the scope to attach global config inside this kernel
    kernel_scope, n = s[output].split(n, nparts=1)

    s[output].bind(n, tvm.thread_axis("blockIdx.z"))
    s[output].bind(bf, tvm.thread_axis("blockIdx.y"))
    s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[output].bind(vf, tvm.thread_axis("vthread"))
    s[output].bind(vy, tvm.thread_axis("vthread"))
    s[output].bind(vx, tvm.thread_axis("vthread"))
    s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
    _, c = s[output].split(c, factor=4)
    #s[output].vectorize(c)

    s[conv].compute_at(s[output], tx)

    # tile and bind reduction axes
    n, f, y, x, c = s[conv].op.axis

    rc, ry, rx, rc_block = s[conv].op.reduce_axis
    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2)
    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2)
    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2)
    rco, rci = cfg['tile_rc'].apply(s, conv, rc)
    ryo, ryi = cfg['tile_ry'].apply(s, conv, ry)
    rxo, rxi = cfg['tile_rx'].apply(s, conv, rx)

    s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block)

    _, rc_block = s[conv].split(rc_block, factor=4)
    s[conv].tensorize(rc_block, dot)

    s[AA].compute_at(s[conv], n)
    s[WW].compute_at(s[conv], rxo)

    # cooperative fetching
    for load in [AA, WW]:
        if load == AA:
            n, f, y, x, c = s[load].op.axis
            if not DOPAD:
                s[load].vectorize(c)
                fused = s[load].fuse(n, f, y, x)
            else:
                c, _ = s[load].split(c, factor=4)
                fused = s[load].fuse(n, f, y, x, c)
        else:
            n, f, y, x, oc_chunk, c = s[load].op.axis
            fused = s[load].fuse(n, f, y, x, oc_chunk)
            s[load].vectorize(c)

        fused, tx = s[load].split(fused, factor=cfg["tile_x"].size[2])
        fused, ty = s[load].split(fused, factor=cfg["tile_y"].size[2])
        fused, tz = s[load].split(fused, factor=cfg["tile_f"].size[2])
        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))

    for load in [AA, WW]:
        name = load.op.name + '_double_buffer'
        cfg.define_knob(name, [0, 1])

        if cfg[name].val:
            s[load].double_buffer()

    # tune unroll
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    s[output].pragma(kernel_scope, 'auto_unroll_max_step',
                     cfg['auto_unroll_max_step'].val)
    s[output].pragma(kernel_scope, 'unroll_explicit', False)

    # num flop
    NH, NW = [e.value for e in output.shape[2:4]]
    cfg.add_flop(N * CO * NH * NW * (CI * KH * KW * 2))
    return s, [raw_data, kernel, output]
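This function is an AutoTVM template (it calls autotvm.get_config() and defines split/knob entities), so it is normally registered and turned into a tuning task rather than called directly. BI, BO, and the `dot` tensorize intrinsic are module-level definitions not shown in the snippet, so the sketch below is illustrative only and follows the pre-0.7 AutoTVM API:

from tvm import autotvm

conv2d_template = autotvm.template(conv2d)   # equivalently, decorate the definition with @autotvm.template
task = autotvm.task.create(conv2d_template,
                           args=(1, 56, 56, 64, 64, 3, 3, 1, 1, 0.25),
                           target='cuda')
print(task.config_space)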
Example #8
def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
    # print('Run in avx512_conv_common decl')
    assert layout == 'NCHW', "only support NCHW convolution on rasp"
    assert data.shape[
        0].value == 1, "only support batch size=1 convolution on rasp"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)

    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
    if len(kernel.shape) == 4:
        num_filter, _, kernel_height, kernel_width = get_const_tuple(
            kernel.shape)
    else:
        num_filter, _, kernel_height, kernel_width, ic, oc = get_const_tuple(
            kernel.shape)
        num_filter *= oc

    pad_height = in_height + 2 * HPAD
    pad_width = in_width + 2 * WPAD

    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1

    # pack data
    # input: c, h, w
    shape = (batch_size, in_channel, pad_height, pad_width)
    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data
    # data_pad = tvm.compute(shape, lambda n, c, h, w: tvm.select(
    #     tvm.all(h >= HPAD, h < pad_height - HPAD, w >= WPAD, w < pad_width - WPAD),
    #     data[n, c, h - HPAD, w - WPAD], 0.0
    # ), name='data_pad')

    shape = (batch_size, in_channel // sch.ic_bn, pad_height, sch.ic_bn,
             pad_width)
    data_vec = tvm.compute(
        shape,
        lambda n, C, h, c, w: data_pad[n, C * sch.ic_bn + c, h, w],
        name='data_vec')

    # pack kernel
    # input: co, ci, h, w
    # output: gOIhw16i16o
    if False:
        shape = (num_filter // sch.oc_bn, in_channel // sch.ic_bn,
                 kernel_height, kernel_width, sch.ic_bn, sch.oc_bn)
        kernel_pack = tvm.compute(
            shape,
            lambda CO, CI, h, w, ci, co: kernel[CO * sch.oc_bn + co, CI * sch.
                                                ic_bn + ci, h, w],
            name='kernel_pack')
    else:
        kernel_pack = kernel

    # convolution
    oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width,
              sch.oc_bn)
    ovshape = (batch_size, num_filter // sch.oc_bn, out_height, sch.oc_bn,
               out_width)
    unpack_shape = (batch_size, num_filter, out_height, out_width)

    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')

    conv = tvm.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_vec[
            n, ic // sch.ic_bn, oh * HSTR + kh, ic % sch.ic_bn, ow * WSTR + kw
        ].astype(out_dtype) * kernel_pack[oc_chunk, ic // sch.ic_bn, kh, kw, ic
                                          % sch.ic_bn, oc_block],
                                                      axis=[ic, kh, kw]),
        name='conv')

    unpack = tvm.compute(
        unpack_shape,
        lambda n, c, h, w: conv[n, c // sch.oc_bn, h, w, c % sch.oc_bn],
        name='output_unpack',
        tag='conv2d_nchw')
    return unpack
Example #9
def _declaration_conv(wkl, data, kernel):
    sch = _get_schedule(wkl)

    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    ndim_input = len(data.shape)

    if ndim_input == 5:
        batch_size, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(
            data.shape)
        in_channel = in_channel_block * in_channel_chunk
    else:
        assert ndim_input == 4
        in_channel_block = 0
        batch_size, in_channel, in_height, in_width = get_const_tuple(
            data.shape)

    num_filter, _, kernel_height, kernel_width, _, co = get_const_tuple(
        kernel.shape)
    num_filter *= co

    pad_height = in_height + 2 * HPAD
    pad_width = in_width + 2 * WPAD

    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1

    # pack data
    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        if ndim_input == 5:
            data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
        else:
            assert ndim_input == 4
            data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    if in_channel_block != sch.ic_bn:
        print('WARNING!!! (common) in_channel_block=%d vs sch.ic_bn=%d' %
              (in_channel_block, sch.ic_bn))
        shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width,
                 sch.ic_bn)
        if ndim_input == 5:
            data_vec = tvm.compute(
                shape,
                lambda n, C, h, w, c: data_pad[
                    n, (C * sch.ic_bn + c) // in_channel_block, h, w,
                    (C * sch.ic_bn + c) % in_channel_block],
                name='data_vec',
                tag="conv2d_data_pack")
        else:
            assert ndim_input == 4
            data_vec = tvm.compute(
                shape,
                lambda n, C, h, w, c: data_pad[n, (C * sch.ic_bn + c), h, w],
                name='data_vec',
                tag="conv2d_data_pack")
            # data_pad = data_vec
    else:
        data_vec = data_pad

    kernel_vec = kernel

    # convolution
    oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width,
              sch.oc_bn)

    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')

    import re
    unpack_channel_block = re.findall(r'\d+', sch.layout_out)
    if len(unpack_channel_block) == 0:
        conv = tvm.compute(
            oshape,
            lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_vec[
                n, ic // sch.ic_bn, oh * HSTR + kh, ow * WSTR + kw, ic % sch.
                ic_bn] * kernel_vec[oc_chunk, ic // sch.ic_bn, kh, kw, ic % sch
                                    .ic_bn, oc_block],
                                                          axis=[ic, kh, kw]),
            name='conv2d')  # , tag="conv2d_nChwc")
        unpack_shape = (batch_size, num_filter, out_height, out_width)
        unpack = tvm.compute(
            unpack_shape,
            lambda n, c, h, w: conv[n, c // sch.oc_bn, h, w, c % sch.oc_bn],
            name='output_unpack',
            tag='conv2d_nChwc_unpack')
    else:
        assert len(unpack_channel_block) == 1
        unpack_channel_block = int(unpack_channel_block[0])
        if unpack_channel_block == sch.oc_bn:
            return tvm.compute(
                oshape,
                lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_vec[
                    n, ic // sch.ic_bn, oh * HSTR + kh, ow * WSTR + kw, ic %
                    sch.ic_bn] * kernel_vec[oc_chunk, ic // sch.ic_bn, kh, kw,
                                            ic % sch.ic_bn, oc_block],
                                                              axis=
                                                              [ic, kh, kw]),
                name='conv2d',
                tag="conv2d_nChwc")
        else:
            conv = tvm.compute(
                oshape,
                lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_vec[
                    n, ic // sch.ic_bn, oh * HSTR + kh, ow * WSTR + kw, ic %
                    sch.ic_bn] * kernel_vec[oc_chunk, ic // sch.ic_bn, kh, kw,
                                            ic % sch.ic_bn, oc_block],
                                                              axis=
                                                              [ic, kh, kw]),
                name='conv2d')
            unpack_shape = (batch_size, num_filter // unpack_channel_block,
                            out_height, out_width, unpack_channel_block)
            unpack = tvm.compute(
                unpack_shape,
                lambda n, C, h, w, c: conv[
                    n, (C * unpack_channel_block + c) // sch.oc_bn, h, w,
                    (C * unpack_channel_block + c) % sch.oc_bn],
                name='output_unpack',
                tag='conv2d_nChwc_unpack')

    return unpack
Example #10
def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, out_dtype):
    """Compute function for Cortex-M7 SIMD implementation of conv2d."""
    assert isinstance(strides, int) or len(strides) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(strides, int):
        stride_h = stride_w = strides
    else:
        stride_h, stride_w = strides

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch_size, in_height, in_width, in_channels = data.shape
    kernel_h, kernel_w, out_channels, _ = kernel.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_down, pad_right, 0]
    padded_data = pad(data, pad_before, pad_after, name='padded_data')

    rc = te.reduce_axis((0, in_channels), name='rc')
    ry = te.reduce_axis((0, kernel_h), name='ry')
    rx = te.reduce_axis((0, kernel_w), name='rx')

    conv = te.compute(
        (batch_size, out_height, out_width, out_channels),
        lambda nn, yy, xx, ff: te.sum(
            padded_data[nn, yy * stride_h + ry * dilation_h,
                        xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
            kernel[ry, rx, ff, rc].astype(out_dtype), axis=[ry, rx, rc]),
        name='conv2d', tag='conv2d_nhwc')

    ###########################
    # Config Space Definition #
    ###########################
    n, oh, ow, co = (cfg.axis(batch_size.value),
                     cfg.axis(out_height.value),
                     cfg.axis(out_width.value),
                     cfg.axis(out_channels.value))
    kh, kw, ci = (cfg.reduce_axis(kernel_h.value),
                  cfg.reduce_axis(kernel_w.value),
                  cfg.reduce_axis(in_channels.value))

    assert in_channels.value % 4 == 0
    owo, owi = cfg.define_split('tile_ow', ow, policy='factors', num_outputs=2)
    cio, cii = cfg.define_split('tile_ci', ci, policy='factors', num_outputs=2,
                                filter=lambda x: x.size[-1] % 4 == 0)
    coo, coi = cfg.define_split('tile_co', co, policy='factors', num_outputs=2)

    cfg.define_reorder('reorder_0_simd',
                       [n, oh, owo, owi, coo, coi, kh, kw, cio, cii],
                       policy='candidate', candidate=[
                           [n, oh, kh, kw, owo, coo, cio, owi, coi, cii],
                           [n, oh, kh, kw, coo, owo, cio, owi, coi, cii],
                           [n, kh, kw, oh, owo, coo, cio, owi, coi, cii],
                           [n, kh, kw, oh, coo, owo, cio, owi, coi, cii]])

    cfg.define_knob('auto_unroll_max_step', [0, 2, 4, 8, 16, 32])
    cfg.define_knob('unroll_explicit', [0, 1])

    return conv
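conv2d_direct_simd_compute only declares the computation and its tuning space; upstream it is paired with a matching schedule function inside an AutoTVM task. The wrapper below is a rough sketch of that pairing; the task name is hypothetical and the plain te.create_schedule stands in for the real Cortex-M7 schedule that consumes the splits and knobs defined above.

from tvm import te, autotvm

@autotvm.template('example/conv2d_direct_simd')   # hypothetical task name
def conv2d_direct_simd(data_shape, kernel_shape, strides, padding, dilation, out_dtype):
    cfg = autotvm.get_config()
    data = te.placeholder(data_shape, name='data', dtype='int8')
    kernel = te.placeholder(kernel_shape, name='kernel', dtype='int8')
    conv = conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, out_dtype)
    sched = te.create_schedule(conv.op)           # stand-in for the real SIMD schedule
    return sched, [data, kernel, conv]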
Example #11
def depth_1by1_fused(Input,
                     Filter_d,
                     Filter_1,
                     stride_d,
                     padding_d='SAME',
                     dilation_d=1,
                     out_dtype=None,
                     layout="NCHW"):
    """Fused depthwise convolution + 1x1 convolution forward operator (NCHW & NHWC).

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width] (NCHW)
                    or [batch, in_height, in_width, in_channel] (NHWC)

    Filter_d : tvm.Tensor
        4-D with shape [in_channel, channel_multiplier, filter_height, filter_width] (NCHW)
                    or [filter_height, filter_width, in_channel, channel_multiplier] (NHWC)

    Filter_1 : tvm.Tensor
        4-D with shape [out_channel, in_channel * channel_multiplier, 1, 1] (NCHW)
                    or [1, 1, in_channel * channel_multiplier, out_channel] (NHWC)

    stride_d : int or a tuple of two ints
        The spatial stride along height and width

    padding_d : int or str
        Padding size, or ['VALID', 'SAME']

    dilation_d: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    out_dtype: str, optional
        Output data type

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width] (NCHW)
                    or [batch, out_height, out_width, out_channel] (NHWC)
    """

    assert layout in ["NCHW", "NHWC"]

    out_dtype = Input.dtype if out_dtype is None else out_dtype

    if isinstance(stride_d, int):
        stride_h_d = stride_w_d = stride_d
    else:
        stride_h_d, stride_w_d = stride_d

    if isinstance(dilation_d, int):
        dilation_h_d = dilation_w_d = dilation_d
    else:
        dilation_h_d, dilation_w_d = dilation_d

    if layout == "NCHW":
        if dilation_h_d != 1 or dilation_w_d != 1:
            Filter_d = dilate(Filter_d, (1, 1, dilation_h_d, dilation_w_d))
        batch, in_channel_d, in_height_d, in_width_d = Input.shape
        filter_channel, _, filter_height, filter_width = Filter_d.shape
        num_filter, channel, _, _ = Filter_1.shape
    else:  # NHWC
        if dilation_h_d != 1 or dilation_w_d != 1:
            Filter_d = dilate(Filter_d, (dilation_h_d, dilation_w_d, 1, 1))
        batch, in_height_d, in_width_d, in_channel_d = Input.shape
        filter_height, filter_width, filter_channel, _ = Filter_d.shape
        _, _, channel, num_filter = Filter_1.shape

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding_d, (filter_height, filter_width))
    out_channel = simplify(in_channel_d)
    out_height = simplify((in_height_d - filter_height + pad_top + pad_down) //
                          stride_h_d + 1)
    out_width = simplify((in_width_d - filter_width + pad_left + pad_right) //
                         stride_w_d + 1)
    out_channel = num_filter

    # padding stage
    if layout == "NCHW":
        pad_before = [0, 0, pad_top, pad_left]
        pad_after = [0, 0, pad_down, pad_right]
    else:  # NHWC
        pad_before = [0, pad_top, pad_left, 0]
        pad_after = [0, pad_down, pad_right, 0]

    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")

    # depthconv stage
    di = tvm.reduce_axis((0, filter_height), name='di')
    dj = tvm.reduce_axis((0, filter_width), name='dj')
    # 1by1 stage
    # reduction runs over the depthwise output channels (channel_multiplier is 1 here)
    c = tvm.reduce_axis((0, in_channel_d), name='c')

    if layout == "NCHW":
        Output = tvm.compute(
            (batch, out_channel, out_height, out_width),
            lambda b, f, i, j: tvm.sum(
                (PaddedInput[b, c, i * stride_h_d + di, j * stride_w_d + dj].
                 astype(out_dtype) * Filter_d[c, 0, di, dj].astype(
                     out_dtype) * Filter_1[f, c, 0, 0].astype(out_dtype)),
                axis=[di, dj, c]),
            name='Depthwise1by1Fused',
            tag="depthwise_1by1_fused_nchw")
    else:  # NHWC
        Output = tvm.compute(
            (batch, out_height, out_width, out_channel),
            lambda b, i, j, f: tvm.sum(
                (PaddedInput[b, i * stride_h_d + di, j * stride_w_d + dj, c].
                 astype(out_dtype) * Filter_d[di, dj, c, 0].astype(
                     out_dtype) * Filter_1[0, 0, c, f].astype(out_dtype)),
                axis=[di, dj, c]),
            name='Depthwise1by1Fused',
            tag="depthwise_1by1_fused_nhwc")
    return Output
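A minimal NHWC usage sketch (not part of the original source); the filter shapes follow the indexing in the compute bodies (HWIO-style 1x1 filter), and pad / dilate / get_pad_tuple / simplify are assumed to come from topi as in the other examples:

data = tvm.placeholder((1, 56, 56, 32), name="data")          # NHWC
filter_d = tvm.placeholder((3, 3, 32, 1), name="filter_d")    # 3x3 depthwise, channel_multiplier = 1
filter_1 = tvm.placeholder((1, 1, 32, 64), name="filter_1")   # 1x1: [1, 1, in_channel, out_channel]
out = depth_1by1_fused(data, filter_d, filter_1, stride_d=1,
                       padding_d='SAME', dilation_d=1, layout="NHWC")
s = tvm.create_schedule(out.op)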
Example #12
def fused_convs(input_data, filters, resnet_block=False):

	out_dtype = input_data.dtype

	Input = None
	nodes = [input_data]
	params = [input_data]

	padded_count = 0
	conv_count = 0
	depthwise_count = 0

	for f in filters:
		Input = nodes[-1]
		Filter = f.placeholder
		layout = f.layout
		depthwise = f.depthwise
		kernel = f.kernel
		stride = f.stride
		padding = f.padding
		dilation = f.dilation

		assert not (depthwise and kernel == 1) # Don't consider 1by1 depthwise

		if isinstance(stride, int):
			stride_h = stride_w = stride
		else:
			stride_h, stride_w = stride

		if isinstance(dilation, int):
			dilation_h = dilation_w = dilation
		else:
			dilation_h, dilation_w = dilation

		batch, in_height, in_width, in_channel = Input.shape
		if f.NHWC_transpose: # HWOI
			kernel_h, kernel_w, tmp, kernel_channel = Filter.shape
		else: # HWIO
			kernel_h, kernel_w, kernel_channel, tmp = Filter.shape
		if depthwise:
			channel_multiplier = tmp
		else:
			num_filter = tmp

		# compute the output shape
		dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
		dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
		pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
			padding, (dilated_kernel_h, dilated_kernel_w))

		out_channel = simplify(in_channel * channel_multiplier) if depthwise else num_filter
		out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
		out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

		if f.kernel > 1:
			print("Padding is needed!")

			pad_before = [0, pad_top, pad_left, 0]
			pad_after = [0, pad_down, pad_right, 0]

			PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput_{}".format(padded_count))
			padded_count += 1
			nodes.append(PaddedInput)

			# Update Input
			Input = PaddedInput
			batch, in_height, in_width, in_channel = Input.shape

		if not depthwise:
			rc = tvm.reduce_axis((0, in_channel), name='rc')
		if kernel > 1:
			ry = tvm.reduce_axis((0, kernel_h), name='ry')
			rx = tvm.reduce_axis((0, kernel_w), name='rx')

		if not depthwise: # Normal convolution
			if kernel > 1:
				Output = tvm.compute(
				(batch, out_height, out_width, out_channel),
				lambda nn, yy, xx, ff: tvm.sum(
					Input[nn, yy * stride_h + ry * dilation_h,
								xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
					(Filter[ry, rx, ff, rc] if f.NHWC_transpose else Filter[ry, rx, rc, ff]).astype(out_dtype), axis=[ry, rx, rc]),
					name="Conv2dOutput_{}".format(conv_count), tag="conv2d_nhwc")
			else: # Only reduce rc axis
				Output = tvm.compute(
				(batch, out_height, out_width, out_channel),
				lambda nn, yy, xx, ff: tvm.sum(
					Input[nn, yy * stride_h, xx * stride_w, rc].astype(out_dtype) *
					(Filter[0, 0, ff, rc] if f.NHWC_transpose else Filter[0, 0, rc, ff]).astype(out_dtype), axis=[rc]),
					name="Conv2dOutput_{}".format(conv_count), tag="conv2d_nhwc")
			conv_count += 1
		else: # Depthwise convolution (kernel > 1)
			Output = tvm.compute(
			(batch, out_height, out_width, out_channel),
			lambda b, i, j, c: tvm.sum(
				(Input[b, i*stride_h + ry*dilation_h, j*stride_w + rx*dilation_w,
							 tvm.indexdiv(c, channel_multiplier)].astype(out_dtype) *
				(Filter[ry, rx, tvm.indexmod(c, channel_multiplier), tvm.indexdiv(c, channel_multiplier)] if f.NHWC_transpose else Filter[ry, rx, tvm.indexdiv(c, channel_multiplier), tvm.indexmod(c, channel_multiplier)]).astype(out_dtype)),
				axis=[ry, rx]),
			name='DepthwiseConv2dOutput_{}'.format(depthwise_count), tag="depthwise_nhwc")
			depthwise_count += 1

		nodes.append(Output)
		params.append(Filter)

	if resnet_block:
		First = nodes[0]
		Last = nodes[-1]
		assert get_const_tuple(First.shape) == get_const_tuple(Last.shape)  # get_const_tuple from topi.util, as in the other examples
		Output = tvm.compute(
			(batch, out_height, out_width, out_channel),
			lambda b, i, j, c: First[b, i, j, c].astype(out_dtype) +
				Last[b, i, j, c].astype(out_dtype),
			name='ElementwiseAddOutput_{}'.format(depthwise_count), tag="elem_nhwc")
		nodes.append(Output)

	params.append(nodes[-1]) # Final output
	return nodes, params
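fused_convs expects every element of `filters` to be a small descriptor exposing the fields read in the loop (placeholder, layout, depthwise, kernel, stride, padding, dilation, NHWC_transpose). The descriptor type is not shown in the source, so the namedtuple below is purely illustrative; it drives a depthwise 3x3 followed by a 1x1 convolution in NHWC:

from collections import namedtuple

FilterParams = namedtuple('FilterParams',
                          ['placeholder', 'layout', 'depthwise', 'kernel',
                           'stride', 'padding', 'dilation', 'NHWC_transpose'])

data = tvm.placeholder((1, 56, 56, 32), name='data')             # NHWC input
dw_filter = tvm.placeholder((3, 3, 32, 1), name='dw_filter')     # HWIO depthwise 3x3
pw_filter = tvm.placeholder((1, 1, 32, 64), name='pw_filter')    # HWIO 1x1
filters = [
    FilterParams(dw_filter, 'NHWC', True, 3, 1, 'SAME', 1, False),
    FilterParams(pw_filter, 'NHWC', False, 1, 1, 'SAME', 1, False),
]
nodes, params = fused_convs(data, filters)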
Example #13
def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
    assert layout == 'NCHWc', "only support NCHWc convolution for AVX"
    wkl = get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)

    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    batch_size, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(
        data.shape)
    num_filter, _, kernel_height, kernel_width, _, co = get_const_tuple(
        kernel.shape)
    num_filter *= co

    pad_height = in_height + 2 * HPAD
    pad_width = in_width + 2 * WPAD

    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1

    # pack data
    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
    else:
        data_pad = data

    in_channel = in_channel_block * in_channel_chunk
    if in_channel_block != sch.ic_bn:
        print('WARNING!!! (common) in_channel_block=%d vs sch.ic_bn=%d' %
              (in_channel_block, sch.ic_bn))
        shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width,
                 sch.ic_bn)
        data_vec = tvm.compute(
            shape,
            lambda n, C, h, w, c: data_pad[
                n, (C * sch.ic_bn + c) // in_channel_block, h, w,
                (C * sch.ic_bn + c) % in_channel_block],
            name='data_vec',
            tag="conv2d_data_pack")
    else:
        data_vec = data_pad

    kernel_vec = kernel

    # convolution
    oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width,
              sch.oc_bn)

    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')

    conv = tvm.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(
            data_vec[n, ic // sch.ic_bn, oh * HSTR + kh, ow * WSTR + kw, ic %
                     sch.ic_bn] * kernel_vec[oc_chunk, ic // sch.ic_bn, kh, kw,
                                             ic % sch.ic_bn, oc_block],
            axis=[ic, kh, kw]),
        name='conv2d_nChwc',
        tag="conv2d_nChwc")

    return conv