Beispiel #1
0
def _declaration_dense_nopack(cfg, data, weight, bias=None, out_dtype=None):
    if out_dtype is None:
        out_dtype = data.dtype
    batch, in_dim = get_const_tuple(data.shape)
    out_dim, _ = get_const_tuple(weight.shape)
    # create tuning space
    cfg.define_split("tile_x", out_dim, num_outputs=2)
    cfg.define_split("tile_y", batch, num_outputs=2)
    cfg.define_split("tile_k", in_dim, num_outputs=2)
    if cfg.is_fallback:
        _default_dense_nopack_config(cfg, batch, out_dim, in_dim)

    vec = cfg["tile_k"].size[-1]
    k = tvm.reduce_axis((0, in_dim // vec), "k")
    CC = tvm.compute((batch, out_dim, vec),
                     lambda z, y, x: tvm.sum(
                         data[z, k * vec + x].astype(out_dtype) *
                         weight[y, k * vec + x].astype(out_dtype), axis=k))

    kk = tvm.reduce_axis((0, vec), "kk")
    C = tvm.compute((batch, out_dim),
                    lambda y, x: tvm.sum(CC[y, x, kk], axis=kk),
                    tag="dense_nopack")
    if bias is not None:
        C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)

    return C
Beispiel #2
0
def packed_conv2d(data,
                  kernel,
                  padding,
                  strides,
                  out_dtype="int32"):
    """ Packed conv2d function.
    """
    if padding[0]:
        pad_data = topi.nn.pad(data, [0, 0, padding[0], padding[1], 0, 0], name="pad_data")
    else:
        pad_data = data
    assert len(data.shape) == 6
    assert len(kernel.shape) == 6
    oheight = topi.util.simplify((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1)
    owidth = topi.util.simplify((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1)
    oshape = (data.shape[0], kernel.shape[0], oheight, owidth, data.shape[4], kernel.shape[4])

    ishape = topi.util.get_const_tuple(data.shape)
    kshape = topi.util.get_const_tuple(kernel.shape)
    assert data.dtype == "int8", data.dtype
    assert kernel.dtype == "int8", kernel.dtype
    d_i = tvm.reduce_axis((0, kshape[2]), name='d_i')
    d_j = tvm.reduce_axis((0, kshape[3]), name='d_j')
    k_o = tvm.reduce_axis((0, ishape[1]), name='k_o')
    k_i = tvm.reduce_axis((0, ishape[-1]), name='k_i')
    hstride, wstride = strides
    res = tvm.compute(
        oshape,
        lambda b_o, c_o, i, j, b_i, c_i: tvm.sum(
            pad_data[b_o, k_o, i*hstride+d_i, j*wstride+d_j, b_i, k_i].astype(out_dtype) *
            kernel[c_o, k_o, d_i, d_j, c_i, k_i].astype(out_dtype),
            axis=[k_o, d_i, d_j, k_i]),
        name="res", tag="packed_conv2d")
    return res
Beispiel #3
0
def test_conv_tiling():
    HSTR = WSTR = 1
    in_channel = 128
    kernel_height = kernel_width = 3
    out_channel = 64
    batch_size = 1
    in_height = in_width = 64
    out_height = out_width = in_height - kernel_height + 1
    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel,
        out_channel), name='kernel')
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')
    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
                       lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] *
                                                     kernel[kh, kw, ic, oc],
                                                     axis=[ic, kh, kw]),
                       name="conv2d")
    s = tvm.create_schedule(conv.op)

    n, oc, oh, ow = conv.op.axis
    oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    stmt = tvm.ir_pass.LoopPartition(stmt, True)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
Beispiel #4
0
    def _sample(i, c, ph, pw):
        roi = rois[i]
        batch_index = roi[0].astype('int32')
        roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4]
        roi_start_h *= spatial_scale
        roi_end_h *= spatial_scale
        roi_start_w *= spatial_scale
        roi_end_w *= spatial_scale

        # force malformed ROIs to be 1x1
        roi_h = tvm.max(roi_end_h - roi_start_h, tvm.const(1.0, dtype))
        roi_w = tvm.max(roi_end_w - roi_start_w, tvm.const(1.0, dtype))

        bin_h = roi_h / pooled_size_h
        bin_w = roi_w / pooled_size_w

        if sample_ratio > 0:
            roi_bin_grid_h = roi_bin_grid_w = tvm.const(sample_ratio, 'int32')
        else:
            roi_bin_grid_h = tvm.ceil(roi_h / pooled_size_h).astype('int32')
            roi_bin_grid_w = tvm.ceil(roi_w / pooled_size_w).astype('int32')

        count = roi_bin_grid_h * roi_bin_grid_w
        rh = tvm.reduce_axis((0, roi_bin_grid_h))
        rw = tvm.reduce_axis((0, roi_bin_grid_w))
        roi_start_h += ph * bin_h
        roi_start_w += pw * bin_w
        return tvm.sum(_bilinear(batch_index, c,
                                 roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
                                 roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w) / count,
                       axis=[rh, rw])
Beispiel #5
0
def test_lstm_cell_inline():
    num_step = 128
    num_input = 256
    num_hidden = 1152
    batch_size = 4
    # Global transition matrix
    X = tvm.placeholder((num_step - 1, batch_size, num_input), name="X")
    Wi2h = tvm.placeholder((4, num_hidden, num_input), name="Wi2h")
    Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h")
    # h: output hidden state, c: cell state.
    s_state_h = tvm.placeholder((num_step, batch_size, num_hidden))
    s_state_c = tvm.placeholder((num_step, batch_size, num_hidden))
    s_init_c = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_c")
    s_init_h = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_h")
    # LSTM transition
    k = tvm.reduce_axis((0, num_input), name="ki2h")
    s_i2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k),
        name="s_i2h")
    k = tvm.reduce_axis((0, num_hidden), name="ki2h")
    s_h2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k),
        name="s_h2h")
    # Gate rules
    gates = tvm.compute(s_i2h.shape, lambda *i:
                        s_i2h(*i) + s_h2h(*i), name="gates")
    gshape = (num_step, batch_size, num_hidden)
    in_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 0, i, j]), name="in_gate")
    in_transform = tvm.compute(gshape, lambda t, i, j: tvm.tanh(gates[t, 1, i, j]), name="in_transform")
    forget_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 2, i, j]), name="forget_gate")
    out_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 3, i, j]), name="out_gate")
    next_c = tvm.compute(gshape,
                         lambda t, i, j:
                         forget_gate[t, i, j] * s_state_c[t - 1, i, j] +
                         in_gate[t, i, j] * in_transform[t, i, j], name="next_c")
    next_h = tvm.compute(gshape,
                         lambda t, i, j: out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]), name="next_h")
    update_c = tvm.compute(gshape, lambda *i: next_c(*i), name="update_c")
    update_h = tvm.compute(gshape, lambda *i: next_h(*i), name="update_h")
    # schedule
    scan_h, scan_c = tvm.scan(
        [s_init_h, s_init_c],
        [update_h, update_c],
        [s_state_h, s_state_c],
        inputs=[X],
        name="lstm_scan")
    # schedule
    s = tvm.create_schedule(scan_h.op)
    # Inline gate computations
    s[gates].compute_inline()
    s[in_gate].compute_inline()
    s[in_transform].compute_inline()
    s[forget_gate].compute_inline()
    s[out_gate].compute_inline()
    # verify we can lower correctly
    tvm.lower(s, [X, Wi2h, Wh2h, scan_h, scan_c])
Beispiel #6
0
def test_rfactor():
    n = tvm.var('n')
    k1 = tvm.reduce_axis((0, n), name="k1")
    k2 = tvm.reduce_axis((0, n), name="k2")
    A = tvm.placeholder((n, n, n), name='A')
    B = tvm.compute((n, ), lambda i: tvm.sum(A[i, k1, k2], axis=[k1, k2]))
    # normal schedule
    s = tvm.create_schedule(B.op)
    BF = s.rfactor(B, k1)
    assert(tuple(BF.shape) == (n, n))
    assert(set(BF.op.body[0].axis) == set([k2]))
    assert(s[B].op.body[0].axis[0].dom.extent == n)
    assert(len(s[B].all_iter_vars) == 2)
    # schedule with splot
    s = tvm.create_schedule(B.op)
    ko, ki = s[B].split(k1, factor=4)
    xo, xi = s[B].split(B.op.axis[0], factor=8)
    BF = s.rfactor(B, ki)
    assert(BF.shape[0].value == 4)
    assert(BF.shape[1] == n)
    assert(BF.op.body[0].axis[0] ==  k2)
    assert(BF.op.body[0].axis[1].var ==  ko.var)
    assert(s[B].op.body[0].axis[0].dom.extent.value == 4)
    # schedule with factor_axis
    s = tvm.create_schedule(B.op)
    ko, ki = s[B].split(k1, factor=4)
    xo, xi = s[B].split(B.op.axis[0], factor=8)
    BF = s.rfactor(B, ki, 1)
    assert(n == BF.shape[0])
    assert(BF.shape[1].value == 4)
    assert(BF.op.body[0].axis[0] ==  k2)
    assert(BF.op.body[0].axis[1].var ==  ko.var)
    assert(s[B].op.body[0].axis[0].dom.extent.value == 4)
Beispiel #7
0
def test_in_bounds_conv_llvm(loop_tiling=False):
    HSTR = WSTR = 1
    in_channel = 128
    kernel_height = kernel_width = 3
    out_channel = 64
    batch_size = 1
    in_height = in_width = 64
    out_height = out_width = in_height - kernel_height + 1
    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel,
        out_channel), name='kernel')
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')
    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
                       lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] *
                                                     kernel[kh, kw, ic, oc],
                                                     axis=[ic, kh, kw]),
                       name="conv2d")
    s = tvm.create_schedule(conv.op)

    n, oc, oh, ow = conv.op.axis
    if loop_tiling:
        oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
    lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True)
    print (lowered_func.body)
    ctx = tvm.cpu (0)

    f = tvm.build(s, [data, kernel, conv], "llvm")
    data_input = tvm.nd.array(np.random.uniform(
          size=(batch_size, in_channel, in_height, in_width)).astype(tvm.float32), ctx)
    kernel_input = tvm.nd.array(np.random.uniform(
          size=(kernel_height, kernel_width, in_channel, out_channel)).astype(tvm.float32), ctx)
    conv_out = tvm.nd.empty ((batch_size, out_channel, out_height, out_width), tvm.float32, ctx)
    f(data_input, kernel_input, conv_out)
Beispiel #8
0
def global_pool(data, pool_type):
    """Perform global pooling on the data

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, channel, in_height, in_width]

    pool_type : str
        Pool type, 'max' or 'avg'

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, channel, 1, 1]
    """
    assert len(data.shape) == 4, "only support 4-dim pooling"
    batch, channel, height, width = data.shape

    dheight = tvm.reduce_axis((0, height))
    dwidth = tvm.reduce_axis((0, width))

    if pool_type == 'max':
        return tvm.compute((batch, channel, 1, 1), lambda n, c, h, w: \
                            tvm.max(data[n, c, dheight, dwidth], axis=[dheight, dwidth]), \
                            tag="global_pool_max")
    elif pool_type == 'avg':
        tsum = tvm.compute((batch, channel, 1, 1), lambda n, c, h, w: \
                            tvm.sum(data[n, c, dheight, dwidth], axis=[dheight, dwidth]), \
                            tag="global_pool_sum")
        return tvm.compute((batch, channel, 1, 1), lambda n, c, h, w: \
                            tsum[n, c, h, w] / (height*width).astype(tsum.dtype), \
                            tag=tag.ELEMWISE)
    else:
        raise ValueError("Pool type should be 'avg' or 'max'.")
Beispiel #9
0
def _spatial_pack(data, kernel, stride, padding, out_dtype):
    """ Compute convolution with pack on spatial axes. """
    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)

    H, W = wkl.height, wkl.width
    CI, CO = wkl.in_filter, wkl.out_filter
    KH, KW = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride
    HCAT, WCAT = KH-1, KW-1

    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    UNROLL = sch.unroll

    TH = H + 2*HPAD
    TW = W + 2*WPAD
    OH = (H + 2*HPAD - KH) // HSTR + 1
    OW = (W + 2*WPAD - KW) // WSTR + 1

    dshape = (1, CI, H, W)
    dpshape = (1, CI, TH, TW)
    dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT)

    kshape = (CO, CI, KH, KW)
    kvshape = (CO/VC, CI, KH, KW, VC)

    ovshape = (1, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (1, CO, OH, OW)

    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: \
        data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec')

    kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc: \
        kernel[co*VC+vc][ci][dh][dw], name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')

    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
        tvm.sum(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw].astype(out_dtype) *
                kernel_vec[co, ci, dh, dw, vc].astype(out_dtype),
                axis=[ci, dh, dw]), name='conv')

    output = tvm.compute(oshape, lambda n, co, h, w:
                         conv[n][co//VC][h/VH][w//VW][h%VH][w%VW][co%VC],
                         name='output_unpack', tag='spatial_conv_output')

    return output
Beispiel #10
0
def test_tensor_reduce_multi_axis():
    m = tvm.var('m')
    n = tvm.var('n')
    A = tvm.placeholder((m, n), name='A')
    k1 = tvm.reduce_axis((0, n), "k")
    k2 = tvm.reduce_axis((0, m), "k")
    C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=(k1, k2)))
    C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=[k1, k2]))
def test_verify_compute():
  n = tvm.var("n")
  m = tvm.var("m")
  A = tvm.placeholder((n, m), name='A')
  k = tvm.reduce_axis((0, m), "k")
  k_ = tvm.reduce_axis((0, m-1), "k_")
  f1 = lambda i: tvm.sum(A[i, k], axis=k)
  f2 = lambda i: A[i,0] + 1
  f3 = lambda i: tvm.sum(A[i, k], axis=k) + 1
  f4 = lambda i: A[i,0] * (tvm.sum(A[i, k], axis=k) + 1)
  f5 = lambda i: (tvm.sum(A[i, k], axis=k), A[i,0] + 1)
  f6 = lambda i: (tvm.sum(A[i, k], axis=k), tvm.sum(A[i, k_], axis=k_))

  #
  # Valid compute
  try:
    B = tvm.compute((n,), f1, name="B")
  except tvm._ffi.base.TVMError as ex:
    assert False

  #
  # Valid compute
  try:
    B = tvm.compute((n,), f2, name="B")
  except tvm._ffi.base.TVMError as ex:
    assert False

  #
  # Invalid compute with non top level reduction
  try:
    B = tvm.compute((n,), f3, name="B")
    assert False
  except tvm._ffi.base.TVMError as ex:
    pass

  #
  # Invalid compute with non top level reduction
  try:
    B = tvm.compute((n,), f4, name="B")
    assert False
  except tvm._ffi.base.TVMError as ex:
    pass

  #
  # Invalid compute with reduction and non-reduction batch ops
  try:
    B0, B1 = tvm.compute((n,), f5, name="B")
    assert False
  except tvm._ffi.base.TVMError as ex:
    pass

  #
  # Invalid compute with unequal batch reduction ops
  try:
    B0, B1 = tvm.compute((n,), f6, name="B")
    assert False
  except tvm._ffi.base.TVMError as ex:
    pass
Beispiel #12
0
def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype):
    """Transposed 2D convolution nchw forward operator.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    Filter : tvm.Tensor
        4-D with shape [in_channel, num_filter, filter_height, filter_width]

    strides : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    out_dtype : str
        The output data type. This is used for mixed precision.

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    batch, in_c, in_h, in_w = Input.shape
    _, out_c, filter_h, filter_w = Filter.shape
    stride_h, stride_w = strides
    # dilate stage
    DilatedInput = dilate(Input, [1, 1, stride_h, stride_w], name='DilatedInput')
    # padding stage
    fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(padding, (filter_h, filter_w))
    bpad_top = filter_h - 1 - fpad_top
    bpad_bottom = filter_h - 1 - fpad_bottom
    bpad_left = filter_w - 1 - fpad_left
    bpad_right = filter_w - 1 - fpad_right
    PaddedInput = pad(DilatedInput, \
                        [0, 0, bpad_top, bpad_left], \
                        [0, 0, bpad_bottom, bpad_right], \
                        name='PaddedInput')
    # convolution stage
    out_c = simplify(out_c)
    out_h = simplify((in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h)
    out_w = simplify((in_w - 1) * stride_w - fpad_left - fpad_right + filter_w)
    dc = tvm.reduce_axis((0, in_c), name='dc')
    dh = tvm.reduce_axis((0, filter_h), name='dh')
    dw = tvm.reduce_axis((0, filter_w), name='dw')

    Output = tvm.compute(
        (batch, out_c, out_h, out_w),
        lambda b, c, h, w: tvm.sum(
            PaddedInput[b, dc, h+dh, w+dw].astype(out_dtype) *
            Filter[dc, c, filter_h-1-dh, filter_w-1-dw].astype(out_dtype),
            axis=[dc, dh, dw]), tag="conv2d_transpose_nchw")

    return Output
Beispiel #13
0
def depthwise_conv2d_backward_input_nhwc(Filter, Out_grad, oshape, ishape, stride, padding):
    """Depthwise convolution nhwc backward wrt input operator.

    Parameters
    ----------
    Filter : tvm.Tensor
        4-D with shape [filter_height, filter_width, in_channel, channel_multiplier]

    Out_grad : tvm.Tensor
        4-D with shape [batch, out_height, out_width, out_channel]

    stride : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]
    """
    batch, in_h, in_w, in_c = ishape
    _, out_h, out_w, out_c = oshape
    filter_h, filter_w, _, channel_multiplier = Filter.shape
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    dilated_out_grad = dilate(Out_grad, [1, stride_h, stride_w, 1], name='dilated_out_grad')

    # padding params in forward propagation
    fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(padding, (filter_h, filter_w))
    # padding params in backward propagation
    bpad_top = filter_h - 1 - fpad_top
    bpad_bottom = (filter_h - 1 - fpad_bottom) + (stride_h - 1)
    bpad_left = filter_w - 1 - fpad_left
    bpad_right = (filter_w - 1 - fpad_right) + (stride_w - 1)

    padded_out_grad = pad(dilated_out_grad, \
                                  [0, bpad_top, bpad_left, 0], \
                                  [0, bpad_bottom, bpad_right, 0], \
                                  name='padded_out_grad')

    dh = tvm.reduce_axis((0, filter_h), name='dh')
    dw = tvm.reduce_axis((0, filter_w), name='dw')
    dc = tvm.reduce_axis((0, channel_multiplier), name='dc')

    In_grad = tvm.compute(
        (batch, in_h, in_w, in_c),
        lambda b, h, w, c: tvm.sum(padded_out_grad[b, h+dh, w+dw, c*channel_multiplier + dc] * \
                                   Filter[filter_h-1-dh, filter_w-1-dw, c, dc],
                                   axis=[dh, dw, dc]), tag='depthwise_conv2d_backward_input_nhwc')

    return In_grad
Beispiel #14
0
def _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, out_dtype):
    """ Compute convolution with pack on spatial axes. """
    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NHWC")
    sch = _get_schedule(wkl, "NHWC")
    VH = sch.vh
    VW = sch.vw
    VC = sch.vc

    data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type='uint8')
    kernel_vec = _kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC)
    N, H, W, IB, CI = data_q.shape
    OCO, KH, KW, KB, VC, _ = kernel_vec.shape

    CO = OCO * VC
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride
    HCAT, WCAT = KH-1, KW-1

    PAD_H = H + 2*HPAD
    PAD_W = W + 2*WPAD
    OH = (H + 2*HPAD - KH) // HSTR + 1
    OW = (W + 2*WPAD - KW) // WSTR + 1
    dvshape = (N, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, IB, CI)
    ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC)
    oshape = (1, OH, OW, CO)

    if (HPAD != 0 and WPAD != 0):
        data_pad = pad(data_q, (0, HPAD, WPAD, 0, 0), name="data_pad")
    else:
        data_pad = data_q

    data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, b, ci: \
        data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][b][ci], name='data_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')
    ib = tvm.reduce_axis((0, IB), name='ib')
    kb = tvm.reduce_axis((0, KB), name='kb')

    def _conv(n, h, w, co, vh, vw, vc):
        return tvm.sum((tvm.popcount(
            kernel_vec[co, dh, dw, kb, vc, ci].astype('uint16') &
            data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci].astype('uint16'))
                        << (kb + ib).astype('uint16')), axis=[dh, dw, kb, ib, ci])

    conv = tvm.compute(ovshape, _conv, name='conv')

    return tvm.compute(oshape, lambda n, h, w, co:
                       conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC].astype(out_dtype),
                       name='output_vec', tag='spatial_bitserial_conv_nhwc')
def test_reduce_simplify():
    ck = CanonicalChecker()
    k = tvm.reduce_axis((0, 10), name="k")
    j = tvm.reduce_axis((-5, 3), name="j")
    A = tvm.placeholder((10,), name='A')
    ck.verify(tvm.sum(tvm.expr.Select(k + j < 12, k + j, 0), [k, j]),
              tvm.sum(k + j, [k, j]))
    ck.verify(tvm.sum(A[3], []), A[3])
    # The rule below is not typical, removed for now
    ck.verify(tvm.sum(k / 10, k), tvm.sum(tvm.const(0, "int32"), k))
Beispiel #16
0
def depthwise_conv2d_nchw(Input, Filter, stride, padding, out_dtype='float32'):
    """Depthwise convolution nchw forward operator.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    Filter : tvm.Tensor
        4-D with shape [in_channel, channel_multiplier, filter_height, filter_width]

    stride : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    out_dtype = Input.dtype

    batch, in_channel, in_height, in_width = Input.shape
    filter_channel, channel_multiplier, filter_height, filter_width = Filter.shape
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (filter_height, filter_width))
    out_channel = simplify(in_channel * channel_multiplier)
    out_height = simplify((in_height - filter_height + pad_top + pad_down) // stride_h + 1)
    out_width = simplify((in_width - filter_width + pad_left + pad_right) // stride_w + 1)

    # padding stage
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
    # depthconv stage
    di = tvm.reduce_axis((0, filter_height), name='di')
    dj = tvm.reduce_axis((0, filter_width), name='dj')
    Output = tvm.compute(
        (batch, out_channel, out_height, out_width),
        lambda b, c, i, j: tvm.sum(
            (PaddedInput[b, c/channel_multiplier, i*stride_h+di, j*stride_w+dj].astype(out_dtype) *
             Filter[c/channel_multiplier, c%channel_multiplier, di, dj].astype(out_dtype)),
            axis=[di, dj]),
        name='DepthwiseConv2d', tag="depthwise_conv2d_nchw")
    return Output
Beispiel #17
0
def _depthwise_conv2d_NCHWc_cpu(cfg, data, kernel, strides, padding, dilation,
                                layout, out_layout, out_dtype=None):
    out_dtype = data.dtype if out_dtype is None else out_dtype
    batch, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(data.shape)
    out_channel_chunk, _, filter_height, filter_width, __, out_channel_block \
        = get_const_tuple(kernel.shape)

    strides = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    HSTR, WSTR = strides
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (filter_height, filter_width))

    dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
    assert (dh, dw) == (1, 1), "Does not support dilation"

    in_channel = in_channel_chunk * in_channel_block
    out_channel = out_channel_chunk * out_channel_block
    channel_multiplier = out_channel // in_channel

    out_height = (in_height - filter_height + pad_top + pad_down) // HSTR + 1
    out_width = (in_width - filter_width + pad_left + pad_right) // WSTR + 1

    # get workload and related schedule config
    wkl = _get_workload(tvm.placeholder((batch, in_channel, in_height, in_width), dtype=data.dtype),
                        tvm.placeholder((out_channel, in_channel, filter_height, filter_width),
                                        dtype=kernel.dtype),
                        strides, padding, out_dtype)
    if cfg.is_fallback:
        _fallback_schedule(cfg, wkl)

    # padding stage
    DOPAD = (pad_top != 0 or pad_left != 0 or pad_down != 0 or pad_right != 0)
    if DOPAD:
        pad_before = [0, 0, pad_top, pad_left, 0]
        pad_after = [0, 0, pad_down, pad_right, 0]
        data_pad = pad(data, pad_before, pad_after, name="PaddedInput")
    else:
        data_pad = data

    # depthconv stage
    kh = tvm.reduce_axis((0, filter_height), name='kh')
    kw = tvm.reduce_axis((0, filter_width), name='kw')
    Output = tvm.compute(
        (batch, out_channel_chunk, out_height, out_width, out_channel_block),
        lambda b, oco, oh, ow, oci: tvm.sum(
            (data_pad[b, (oco * out_channel_block + oci) // channel_multiplier // in_channel_block,
                      oh*HSTR+kh, ow*WSTR+kw,
                      ((oco * out_channel_block + oci) // channel_multiplier) % in_channel_block]
             .astype(out_dtype) *
             kernel[oco, 0, kh, kw, 0, oci].astype(out_dtype)),
            axis=[kh, kw]),
        name='DepthwiseConv2d', tag="depthwise_conv2d_NCHWc")
    return Output
Beispiel #18
0
def compute_conv(data, weight):
    N, IC, H, W = data.shape
    OC, IC, KH, KW = weight.shape
    OH = H - KH + 1
    OW = W - KW + 1

    ic = tvm.reduce_axis((0, IC), name='ic')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')

    return tvm.compute((N, OC, OH, OW), lambda i, oc, h, w: \
        tvm.sum(data[i, ic, h+dh, w+dw] * weight[oc, ic, dh, dw],
                axis=[ic, dh, dw]))
Beispiel #19
0
def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'):
    """Convolution operator in NHWC layout.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]

    Filter : tvm.Tensor
        4-D with shape [filter_height, filter_width, in_channel, num_filter]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, out_height,  out_width, out_channel]
    """
    assert isinstance(stride, int) or len(stride) == 2
    batch, in_height, in_width, in_channel = Input.shape
    kernel_h, kernel_w, channel, num_filter = Filter.shape
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (kernel_h, kernel_w))
    # compute the output shape
    out_channel = num_filter
    out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_down, pad_right, 0]
    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
    rc = tvm.reduce_axis((0, in_channel), name='rc')
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')
    Output = tvm.compute(
        (batch, out_height, out_width, out_channel),
        lambda nn, yy, xx, ff: tvm.sum(
            PaddedInput[nn, yy * stride_h + ry, xx * stride_w + rx, rc].astype(out_dtype) *
            Filter[ry, rx, rc, ff].astype(out_dtype), axis=[ry, rx, rc]),
        name="Conv2dOutput", tag="conv2d_nhwc")
    return Output
Beispiel #20
0
def conv2d_nchw(Input, Filter, stride, padding, out_dtype='float32'):
    """Convolution operator in NCHW layout.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    Filter : tvm.Tensor
        4-D with shape [num_filter, in_channel, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    assert isinstance(stride, int) or len(stride) == 2
    batch, in_channel, in_height, in_width = Input.shape
    num_filter, channel, kernel_h, kernel_w = Filter.shape
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (kernel_h, kernel_w))
    # compute the output shape
    out_channel = num_filter
    out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
    # compute graph
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    temp = pad(Input, pad_before, pad_after, name="pad_temp")
    rc = tvm.reduce_axis((0, in_channel), name='rc')
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')

    return tvm.compute(
        (batch, out_channel, out_height, out_width),
        lambda nn, ff, yy, xx: tvm.sum(
            temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
            Filter[ff, rc, ry, rx].astype(out_dtype),
            axis=[rc, ry, rx]), tag="conv2d_nchw")
Beispiel #21
0
def conv2d_winograd_weight_transform(kernel, tile_size):
    """Weight transformation for winograd

    Parameters
    ----------
    kernel: Tensor
        The raw kernel tensor with layout "NCHW". Only 3x3 kernel is supported for now
    tile_size: int
        Tile size of winograd transform. e.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [alpha, alpha, CO, CI]
    """
    K = 3

    shape = get_const_tuple(kernel.shape)
    assert shape[2:] == (K, K), "Only support 3x3 kernel"

    r = tile_size + K - 1
    shape = (r, r) + shape[:2]

    if tile_size == 2:
        G_data = np.array([
            [1, 0, 0],
            [1.0/2, 1.0/2, 1.0/2],
            [1.0/2, -1.0/2, 1.0/2],
            [0, 0, 1],
        ], dtype=kernel.dtype)
    elif tile_size == 4:
        G_data = np.array([
            [1 / 4.0, 0, 0],
            [-1 / 6.0, -1 / 6.0, -1 / 6.0],
            [-1 / 6.0, 1 / 6.0, -1 / 6.0],
            [1 / 24.0, 1 / 12.0, 1 / 6.0],
            [1 / 24.0, -1 / 12.0, 1 / 6.0],
            [0, 0, 1]
        ], dtype=kernel.dtype)
    else:
        raise ValueError("Unsupoorted tile size:" + tile_size)

    G = const_matrix(G_data, 'G')
    r_kh = tvm.reduce_axis((0, K), name='r_kh')
    r_kw = tvm.reduce_axis((0, K), name='r_kw')
    return tvm.compute(shape, lambda eps, nu, co, ci:
                       tvm.sum(kernel[co][ci][r_kh][r_kw] *
                               G[eps][r_kh] * G[nu][r_kw],
                               axis=[r_kh, r_kw]), name='transform_weight')
Beispiel #22
0
def depthwise_conv2d_backward_weight_nhwc(Input, Out_grad, oshape, fshape, stride, padding):
    """Depthwise convolution nhwc backward wrt weight operator.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]

    Out_grad : tvm.Tensor
        4-D with shape [batch, out_height, out_width, out_channel]

    stride : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [filter_height, filter_width, in_channel, channel_multiplier]
    """
    batch, out_h, out_w, out_c = oshape
    filter_h, filter_w, _, channel_multiplier = fshape
    in_c = Input.shape[3].value
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (filter_h, filter_w))

    padded_in = pad(Input, \
                        [0, pad_top, pad_left, 0], \
                        [0, pad_bottom, pad_right, 0], \
                        name='padded_in')

    dh = tvm.reduce_axis((0, Out_grad.shape[1].value), name='dh')
    dw = tvm.reduce_axis((0, Out_grad.shape[2].value), name='dw')
    db = tvm.reduce_axis((0, batch), name='db')

    Weight_grad = tvm.compute(
        (filter_h, filter_w, in_c, channel_multiplier),
        lambda fh, fw, c, m: tvm.sum(
            Out_grad[db, dh, dw, c*channel_multiplier+m%channel_multiplier] *
            padded_in[db, fh+dh*stride_h, fw+dw*stride_w, c], axis=[db, dh, dw]),
        tag='depthwise_conv2d_backward_weight_nhwc')

    return Weight_grad
Beispiel #23
0
def _declaration_dense_pack(cfg, data, weight, bias=None, out_dtype=None):
    if out_dtype is None:
        out_dtype = data.dtype
    batch, in_dim = get_const_tuple(data.shape)
    out_dim, _ = get_const_tuple(weight.shape)
    # create tuning space
    cfg.define_split("tile_y", batch, num_outputs=3)
    cfg.define_split("tile_x", out_dim, num_outputs=3)
    cfg.define_split("tile_k", in_dim, num_outputs=2)
    if cfg.is_fallback:
        _default_dense_pack_config(cfg, batch, out_dim, in_dim)

    packw_bn = cfg["tile_x"].size[-1]
    packw_shape = (out_dim // packw_bn, in_dim, packw_bn)
    packw = tvm.compute(packw_shape,
                        lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight")

    k = tvm.reduce_axis((0, in_dim), name="k")
    C = tvm.compute((batch, out_dim),
                    lambda y, x: tvm.sum(
                        data[y, k].astype(out_dtype) *
                        packw[x // packw_bn, k, x % packw_bn].astype(out_dtype),
                        axis=k),
                    tag="dense_pack")
    if bias is not None:
        C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)
    return C
Beispiel #24
0
def dense_default(data, weight, bias=None):
    """The default implementation of dense in topi.

    Parameters
    ----------
    data : tvm.Tensor
        2-D with shape [batch, in_dim]

    weight : tvm.Tensor
        2-D with shape [out_dim, in_dim]

    bias : tvm.Tensor, optional
        1-D with shape [out_dim]

    Returns
    -------
    output : tvm.Tensor
        2-D with shape [batch, out_dim]
    """
    assert len(data.shape) == 2 and len(weight.shape) == 2, \
        "only support 2-dim dense"
    if bias is not None:
        assert len(bias.shape) == 1
    batch, in_dim = data.shape
    out_dim, _ = weight.shape
    k = tvm.reduce_axis((0, in_dim), name='k')
    matmul = tvm.compute((batch, out_dim), \
                         lambda i, j: tvm.sum(data[i, k] * weight[j, k], axis=k), \
                         tag='dense')
    if bias is not None:
        matmul = tvm.compute((batch, out_dim), \
                             lambda i, j: matmul[i, j] + bias[j], \
                             tag=tag.BROADCAST)
    return matmul
Beispiel #25
0
def test_dot():
    nn = 12
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    k = tvm.reduce_axis((0, n), 'k')
    C = tvm.compute((1,), lambda _: tvm.sum(A[k] * B[k], axis=k), name='C')
    s = tvm.create_schedule(C.op)
    fapi = lower(s, [A, B, C])

    def verify(target):
        if not tvm.module.enabled(target):
            print("Target %s is not enabled" % target)
            return
        f = tvm.codegen.build_module(fapi, target)
        # verify
        ctx = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), ctx)
        c  = tvm.nd.array(np.zeros((1,), dtype=C.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-4)

    verify("llvm")
Beispiel #26
0
def intrin_gemv(m, l):
    a = tvm.placeholder((l,), name='a')
    b = tvm.placeholder((m, l), name='b')
    k = tvm.reduce_axis((0, l), name='k')
    c = tvm.compute((m,), lambda i: tvm.sum(a[k] * b[i, k], axis=k), name='c')
    Ab = tvm.decl_buffer(a.shape, a.dtype,
                         name="A",
                         offset_factor=1,
                         strides=[1])
    Bb = tvm.decl_buffer(b.shape, b.dtype,
                         name="B",
                         offset_factor=1,
                         strides=[tvm.var("s1"), 1])
    Cb = tvm.decl_buffer(c.shape, c.dtype,
                         name="C",
                         offset_factor=1,
                         strides=[1])
    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        aa, bb = ins
        cc = outs[0]
        ib.emit(tvm.call_extern("int32", "gemv_update",
                                cc.access_ptr("w"),
                                aa.access_ptr("r"),
                                bb.access_ptr("r"),
                                m, l, bb.strides[0]))
        return ib.get()
    with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
Beispiel #27
0
def matmul(N, L, M, dtype):
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)

    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
    s = tvm.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    ##### define space begin #####
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_x", x, num_outputs=2)
    ##### define space end #####

    # schedule according to config
    yo, yi = cfg["tile_y"].apply(s, C, y)
    xo, xi = cfg["tile_x"].apply(s, C, x)

    s[C].reorder(yo, xo, k, yi, xi)

    return s, [A, B, C]
Beispiel #28
0
def test_rfactor():
    n = tvm.convert(1027)
    A = tvm.placeholder((n,), name='A')
    k = tvm.reduce_axis((0, n))
    B = tvm.compute((1,), lambda i: tvm.sum(A[k], axis=k), name='B')
    # schedule
    s = tvm.create_schedule(B.op)
    kf, ki = s[B].split(k, nparts=4)
    BF = s.rfactor(B, kf)
    s[BF].parallel(BF.op.axis[0])
    # one line to build the function.
    def check_target(target="llvm"):
        if not tvm.module.enabled(target):
            return
        ctx = tvm.cpu(0)
        fapi = tvm.lower(s, args=[A, B])
        fsum = tvm.build(fapi,
                         target=target,
                         name="mysum")
        # launch the kernel.
        n = 1027
        a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
        b  = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx)
        fsum(a, b)
        res = np.sum(a.asnumpy(), axis=0)
        tvm.testing.assert_allclose(
            b.asnumpy(), res, rtol=1e-4)

    check_target()
def intrin_gemv(m, n):
    w = tvm.placeholder((m, n), name='w')
    x = tvm.placeholder((n,), name='x')
    k = tvm.reduce_axis((0, n), name='k')
    z = tvm.compute((m,), lambda i:
                    tvm.sum(w[i, k] * x[k], axis=k), name='z')
    Wb = tvm.decl_buffer(w.shape, w.dtype,
                         name="W",
                         offset_factor=16,
                         strides=[tvm.var('ldw'), 1])
    def intrin_func(ins, outs):
        ww, xx = ins
        zz = outs[0]
        ww_ptr = ww.access_ptr("r")
        xx_ptr = xx.access_ptr("r")
        zz_ptr = zz.access_ptr("w")
        body = tvm.call_packed(
            "gemm", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        reset = tvm.call_packed(
            "fill_zero", zz_ptr, n)
        update = tvm.call_packed(
            "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        return body, reset, update

    with tvm.build_config(data_alignment=16,
                          offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func,
                                      binds={w: Wb})
Beispiel #30
0
    def get_gemm_feature(target):
        k = tvm.reduce_axis((0, N), 'k')
        A = tvm.placeholder((N, N), name='A')
        B = tvm.placeholder((N, N), name='B')
        C = tvm.compute(A.shape, lambda y, x: tvm.sum(A[y, k] * B[k, x], axis=k),
                        name='C')

        s = tvm.create_schedule(C.op)

        y, x = s[C].op.axis
        axes = list(s[C].tile(y, x, 8, 8)) + [k]
        perm = np.random.permutation(5)
        axes = [axes[x] for x in perm]
        s[C].reorder(*axes)

        if "gpu" in target.keys:
            pick = []
            # filter out reduction axis
            for i in range(len(perm)):
                if perm[i] != 4:
                    pick.append(axes[i])
            s[C].bind(pick[0], tvm.thread_axis("blockIdx.x"))
            s[C].bind(pick[1], tvm.thread_axis("vthread"))
            s[C].bind(pick[2], tvm.thread_axis("threadIdx.y"))

        with target:
            feas = feature.get_itervar_feature(s, [A, B, C])
            feas = feature.flatten_itervar_feature(feas)
        return feas
def bitserial_dense_default(cfg,
                            data,
                            weight,
                            data_bits,
                            weight_bits,
                            pack_dtype='uint32',
                            out_dtype='int16',
                            unipolar=True):
    """Bitserial dense implementation. TODO: Why are these separate

    Parameters
    ----------
    data : tvm.Tensor
        2-D with shape [batch, in_dim]
    weight : tvm.Tensor
        2-D with shape [out_dim, in_dim] or
        3-D with shape [out_dim, weight_bits, in_dim]
    Returns
    -------
    output : tvm.Tensor
        2-D with shape [batch, out_dim]
    """
    data_packed = bitpack(data,
                          data_bits,
                          pack_axis=1,
                          bit_axis=1,
                          pack_type=pack_dtype)
    if len(weight.shape) == 2:
        weight_packed = bitpack(weight,
                                weight_bits,
                                pack_axis=1,
                                bit_axis=1,
                                pack_type=pack_dtype)
    else:
        weight_packed = weight
    Y, DB, K = get_const_tuple(data_packed.shape)
    X, WB, _ = get_const_tuple(weight_packed.shape)
    ######## Search space
    x, y = cfg.axis(X), cfg.axis(Y)
    db, wb, k = cfg.reduce_axis(DB), cfg.reduce_axis(WB), cfg.reduce_axis(K)
    ko, ki = cfg.define_split('tile_k', k, policy='all', num_outputs=2)
    yo, yi = cfg.define_split('tile_y', y, policy='all', num_outputs=2)
    xo, xi = cfg.define_split('tile_x', x, policy='all', num_outputs=2)

    cfg.define_reorder('reorder_0', [yo, xo, ko, yi, wb, db, ki, xi],
                       policy='candidate',
                       candidate=[[yo, xo, ko, yi, wb, db, ki, xi],
                                  [yo, xo, yi, ko, wb, db, ki, xi]])

    cfg.define_annotate('ann_reduce', [db, wb], policy='try_unroll')
    cfg.define_annotate('ann_spatial', [yi, xi], policy='try_unroll_vec')

    ###### Compute rule
    VX = cfg['tile_x'].size[-1]

    wvshape = (X // VX, WB, VX, K)
    oshape = (Y, X)

    k = tvm.reduce_axis((0, K), name='k')
    db = tvm.reduce_axis((0, DB), name='db')
    wb = tvm.reduce_axis((0, WB), name='wb')

    # Tile data and weights
    weight_vec = tvm.compute(
        wvshape,
        lambda xo, wb, vx, k: weight_packed[xo * VX + vx][wb][k],
        name='weight_vec')

    matmul_unipolar = tvm.compute(
        oshape,
        lambda i, j: tvm.sum((tvm.popcount(weight_vec[
            j // VX, wb, j % VX, k] & data_packed[i, db, k]) - tvm.popcount(
                ~weight_vec[j // VX, wb, j % VX, k] & data_packed[i, db, k])).
                             astype(out_dtype) << (db + wb).astype(out_dtype),
                             axis=[wb, db, k]),
        tag='bitserial_dense_unipolar')

    matmul = tvm.compute(oshape,
                         lambda i, j: tvm.sum(tvm.popcount(weight_vec[
                             j // VX, wb, j % VX, k] & data_packed[
                                 i, db, k]).astype(out_dtype) <<
                                              (db + wb).astype(out_dtype),
                                              axis=[wb, db, k]),
                         tag='bitserial_dense')

    # binary ops
    cfg.add_flop(2 * Y * X * K * binary_op_multiplier(pack_dtype))

    if unipolar:
        return matmul_unipolar
    return matmul
Beispiel #32
0
def _decl_im2col(data,
                 kernel,
                 stride,
                 padding,
                 layout='NCHW',
                 out_dtype='float32'):
    """declare the Im2Col method for conv2d"""
    _, CI, IH, IW = [x.value for x in data.shape]
    CO, _, KH, KW = [x.value for x in kernel.shape]
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride

    N = 1
    OH = (IH + 2 * HPAD - KH) // HSTR + 1
    OW = (IW + 2 * WPAD - KW) // WSTR + 1

    DO_PAD = (HPAD != 0 and WPAD != 0)
    if DO_PAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    ALIGN = 16

    def upround(x, align):
        return (x + align - 1) // align * align

    # A [CO, CI * KH * KW]
    reduce_len = upround(CI * KH * KW, ALIGN)
    A = tvm.compute(
        (upround(CO, ALIGN), reduce_len),
        lambda i, j: kernel[i][j // KW // KH][j // KW % KH][j % KW],
        name='A')

    # B [CI * KH * KW, N * OH * OW]
    B = tvm.compute((reduce_len, upround(N * OH * OW, ALIGN)), lambda i, j:\
            tvm.select(tvm.all(i < CI * KH * KW, j < N * OH * OW),
                       data_pad[j // (OH*OW)][i // (KH*KW)][j // OW % OH*HSTR + i // KW % KH]
                       [j % OW*WSTR + i % KW],
                       tvm.const(0, data_pad.dtype)), name='B')

    gemm_n, gemm_l, gemm_m = A.shape[0], reduce_len, B.shape[1]

    # C [CO, N * OH * OW]
    k = tvm.reduce_axis((0, gemm_l), name='k')
    C = tvm.compute((gemm_n, gemm_m),
                    lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k),
                    name='C')

    # output
    # the last term C[gemm_n-1, gemm_m-1] is for enabling the alignment,
    # otherwise the alignment above will be eliminated by bound inference
    output = tvm.compute((N, CO, OH, OW), lambda n, co, h, w:\
                 C[co][n * OW * OW + h * OW + w] + tvm.const(0, C.dtype) * C[gemm_n-1, gemm_m-1],
                         name='output', tag='im2col_conv_output')

    return output
Beispiel #33
0
def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout,
                  out_dtype, pre_computed):
    """Compute declaration for winograd"""
    assert layout == 'NCHW'

    tile_size = _infer_tile_size(data, kernel)

    N, CI, H, W = get_const_tuple(data.shape)

    if not pre_computed:  # kernel tensor is raw tensor, do strict check
        if isinstance(dilation, int):
            dilation_h = dilation_w = dilation
        else:
            dilation_h, dilation_w = dilation
        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))

        CO, CI, KH, KW = get_const_tuple(kernel.shape)
        HPAD, WPAD, _, _ = nn.get_pad_tuple(padding, kernel)
        HSTR, WSTR = (strides,
                      strides) if isinstance(strides, int) else strides
        assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1 and KH == 3 and KW == 3
    else:  # kernel tensor is pre-transfomred. this op is created by
        # alter op layout, do not check
        # dilation is not supported
        HSTR = WSTR = 1
        HPAD = WPAD = 1
        KH = KW = 3
        _, _, CI, CO = get_const_tuple(kernel.shape)

    data_pad = nn.pad(data, (0, 0, HPAD, WPAD), (0, 0, HPAD, WPAD),
                      name="data_pad")

    if tile_size == 4:
        G_data = np.array(
            [[1 / 4.0, 0, 0], [-1 / 6.0, -1 / 6.0, -1 / 6.0],
             [-1 / 6.0, 1 / 6.0, -1 / 6.0], [1 / 24.0, 1 / 12.0, 1 / 6.0],
             [1 / 24.0, -1 / 12.0, 1 / 6.0], [0, 0, 1]],
            dtype=np.float32)

        B_data = np.array(
            [[4, 0, 0, 0, 0, 0], [0, -4, 4, -2, 2, 4], [-5, -4, -4, -1, -1, 0],
             [0, 1, -1, 2, -2, -5], [1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 1]],
            out_dtype)

        A_data = np.array([[1, 0, 0, 0], [1, 1, 1, 1], [1, -1, 1, -1],
                           [1, 2, 4, 8], [1, -2, 4, -8], [0, 0, 0, 1]],
                          out_dtype)
    elif tile_size == 2:
        G_data = np.array([[1, 0, 0], [1.0 / 2, 1.0 / 2, 1.0 / 2],
                           [1.0 / 2, -1.0 / 2, 1.0 / 2], [0, 0, 1]],
                          np.float32)

        B_data = np.array(
            [[1, 0, 0, 0], [0, 1, -1, 1], [-1, 1, 1, 0], [0, 0, 0, -1]],
            out_dtype)

        A_data = np.array([[1, 0], [1, 1], [1, -1], [0, -1]], out_dtype)
    else:
        raise ValueError("Unsupported tile size for winograd: " +
                         str(tile_size))

    m = A_data.shape[1]
    r = 3
    alpha = m + r - 1
    H = (H + 2 * HPAD - KH) // HSTR + 1
    W = (W + 2 * WPAD - KW) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    # transform kernel
    if not pre_computed:
        G = const_matrix(G_data, 'G')
        r_kh = tvm.reduce_axis((0, KH), name='r_kh')
        r_kw = tvm.reduce_axis((0, KW), name='r_kw')
        kernel_pack = tvm.compute(
            (alpha, alpha, CI, CO),
            lambda eps, nu, ci, co: tvm.sum(kernel[co][ci][r_kh][r_kw] * G[eps]
                                            [r_kh] * G[nu][r_kw],
                                            axis=[r_kh, r_kw]),
            name='kernel_pack')
    else:
        kernel_pack = kernel

    # pack input tile
    input_tile = tvm.compute((CI, P, alpha, alpha),
                             lambda c, p, eps, nu: data_pad[p // (nH * nW)][c][
                                 p // nW % nH * m + eps][p % nW * m + nu],
                             name='d')

    # transform data
    B = const_matrix(B_data)
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_a')
    data_pack = tvm.compute((alpha, alpha, CI, P),
                            lambda eps, nu, ci, p: tvm.sum(input_tile[ci][p][
                                r_a][r_b] * B[r_a][eps] * B[r_b][nu],
                                                           axis=[r_a, r_b]),
                            name='data_pack')

    # do batch gemm
    ci = tvm.reduce_axis((0, CI), name='ci')
    bgemm = tvm.compute((alpha, alpha, CO, P),
                        lambda eps, nu, co, p: tvm.sum(kernel_pack[eps][nu][
                            ci][co] * data_pack[eps][nu][ci][p],
                                                       axis=[ci]),
                        name='bgemm')

    # inverse transform
    A = const_matrix(A_data)
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_a')
    inverse = tvm.compute(
        (CO, P, m, m),
        lambda co, p, vh, vw: tvm.sum(
            bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]),
        name='inverse')

    # output
    output = tvm.compute(
        (N, CO, H, W),
        lambda n, co, h, w: inverse[co][n * nH * nW +
                                        (h // m) * nW + w // m][h % m][w % m],
        name='output',
        tag='conv2d_nchw_winograd')
    cfg.add_flop(2 * N * CO * H * W * CI * KH * KW)

    return output
Beispiel #34
0
def spatial_pack_nchw(cfg, data, kernel, stride, padding, in_bits, weight_bits,
                      pack_dtype='uint32', out_dtype='int16', unipolar=True):
    """ Compute convolution with pack on spatial axes. """
    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
    data_q = bitpack(data, in_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype)
    # Check if kernel is already bitpacked
    if len(kernel.shape) == 4:
        kernel_q = bitpack(kernel, weight_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype)
        KB, CO, _, KH, KW = get_const_tuple(kernel_q.shape)
    else:
        kernel_vec = kernel
        OCO, _, KH, KW, KB, VC = get_const_tuple(kernel_vec.shape)
        CO = OCO * VC

    IB, N, CI, H, W = get_const_tuple(data_q.shape)
    KB, CO, _, KH, KW = get_const_tuple(kernel_q.shape)

    if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2):
        TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel)
    else:
        TPAD, LPAD, DPAD, RPAD = padding
    pad_before = [0, 0, 0, TPAD, LPAD]
    pad_after = [0, 0, 0, DPAD, RPAD]

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride
    HCAT, WCAT = KH-1, KW-1

    TH = H + TPAD + DPAD
    TW = W + LPAD + RPAD
    OH = (H + TPAD + DPAD - KH) // HSTR + 1
    OW = (W + LPAD + RPAD - KW) // WSTR + 1

     # ==================== define configuration space ====================
    n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
    ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
    ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits)

    co, vc = cfg.define_split('tile_co', co, policy='all', num_outputs=2,
                              filter=lambda x: max(x.size[1:]) <= 16)
    oh, vh = cfg.define_split('tile_oh', oh, policy='all', num_outputs=2,
                              filter=lambda x: max(x.size[1:]) <= 16)
    ow, vw = cfg.define_split('tile_ow', ow, policy='all', num_outputs=2,
                              filter=lambda x: max(x.size[1:]) <= 16)
    cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll')

    re_axes = cfg.define_reorder("reorder_0",
                                 [n, co, oh, ow, vc, vh, vw, kh, kw, kb, ib, ci],
                                 policy='interval_all', interval=(6, 11))
    cfg.add_flop(2 * N * OH * OW * CO * CI * 8 * KH * KW) # these are actually binary ops
    # ====================

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT, IB)
    kvshape = (CO//VC, CI, KH, KW, KB, VC)
    ovshape = (1, CO//VC, OH//VH, OW//VW, VH, VW, VC)
    oshape = (1, CO, OH, OW)

    if (TPAD != 0 and RPAD != 0):
        data_pad = pad(data_q, (0, 0, 0, TPAD, LPAD), (0, 0, 0, DPAD, RPAD), name="data_pad")
    else:
        data_pad = data_q

    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw, b: \
        data_pad[b][n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec')

    if len(kernel.shape) == 4:
        kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, b, vc: \
            kernel_q[b][co*VC+vc][ci][dh][dw], name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')
    b1 = tvm.reduce_axis((0, IB), name='ib')
    b2 = tvm.reduce_axis((0, KB), name='kb')

    def _conv(n, co, h, w, vh, vw, vc):
        b1b2 = (b1+b2).astype(out_dtype)
        if unipolar:
            return tvm.sum((tvm.popcount(
                data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) &
                kernel_vec[co, ci, dh, dw, b2, vc].astype(out_dtype))  -
                            tvm.popcount(
                                data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype)
                                & ~kernel_vec[co, ci, dh, dw, b2, vc]).astype(out_dtype)) << b1b2,
                           axis=[ci, dh, dw, b1, b2])

        return tvm.sum((tvm.popcount(
            data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] &
            kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2,
                       axis=[ci, dh, dw, b1, b2])

    conv = tvm.compute(ovshape, _conv, name='conv_out')

    return tvm.compute(oshape, lambda n, co, h, w:
                       conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
                       name='conv_vec', tag='spatial_bitserial_conv_nchw')
Beispiel #35
0
with ScheduleProcHelper():
    env = nnpu.get_env()
    shape = (48, 48)
    insn_shape = (16, 16)

    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder(shape, dtype_n, 'a')
    b = tvm.placeholder(shape, dtype_n, 'b')
    
    sph = ScheduleProcHelper.current

    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)

    k = tvm.reduce_axis((0, shape[1]), 'k')
    dot_shape = (shape[0], )
    dot_buf = tvm.compute(dot_shape, 
                lambda i: tvm.sum(a_buf[i, k].astype(dtype_w) * 
                                     b_buf[i, k].astype(dtype_w), k), 
                'dot_buf')
    sph.MarkScope(dot_buf, 'acc')
    
    res_buf = nnpu.utils.CopyAccToBuf(dot_buf, 'res')
    
    res_host, _ = nnpu.utils.CopyBufToH(res_buf, 'res')

    # tensorize
    s = nnpu.create_schedule(res_host.op)
    xo, ro, xi, ri = s[dot_buf].tile(dot_buf.op.axis[0], dot_buf.op.reduce_axis[0],
                                     insn_shape[0], insn_shape[1])
Beispiel #36
0
def _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation,
                           layout, out_dtype):
    out_dtype = data.dtype if out_dtype is None else out_dtype
    assert layout == 'NCHW', "only support NCHW convolution for AVX"

    assert isinstance(dilation, int) or len(dilation) == 2
    if isinstance(dilation, int):
        dilation_h, dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    HPAD, WPAD = padding
    HSTR, WSTR = strides

    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
    num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)

    pad_height = in_height + 2 * HPAD
    pad_width = in_width + 2 * WPAD

    dilated_kernel_h = (kernel_height - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_width - 1) * dilation_w + 1
    out_height = (in_height + 2 * HPAD - dilated_kernel_h) // HSTR + 1
    out_width = (in_width + 2 * WPAD - dilated_kernel_w) // WSTR + 1

    # pack data
    DOPAD = (HPAD != 0 or WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    # fetch schedule
    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

    shape = (batch_size, in_channel // ic_bn, pad_height, ic_bn, pad_width)
    data_vec = tvm.compute(
        shape,
        lambda n, C, h, c, w: data_pad[n, C * ic_bn + c, h, w],
        name='data_vec')

    # pack kernel
    shape = (num_filter // oc_bn, in_channel // ic_bn, kernel_height,
             kernel_width, ic_bn, oc_bn)
    kernel_vec = tvm.compute(shape,
                             lambda CO, CI, h, w, ci, co: kernel[
                                 CO * oc_bn + co, CI * ic_bn + ci, h, w],
                             name='kernel_vec')

    # convolution
    oshape = (batch_size, num_filter // oc_bn, out_height, out_width, oc_bn)
    unpack_shape = (batch_size, num_filter, out_height, out_width)

    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')

    conv = tvm.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_vec[
            n, ic // ic_bn, oh * HSTR + kh * dilation_h, ic % ic_bn, ow * WSTR
            + kw * dilation_w].astype(out_dtype) * kernel_vec[
                oc_chunk, ic // ic_bn, kh, kw, ic % ic_bn, oc_block].astype(
                    out_dtype),
                                                      axis=[ic, kh, kw]),
        name='conv')

    unpack = tvm.compute(unpack_shape,
                         lambda n, c, h, w: conv[n, c // oc_bn, h, w, c % oc_bn
                                                 ].astype(out_dtype),
                         name='output_unpack',
                         tag='conv2d_nchw')
    return unpack
Beispiel #37
0
def group_conv2d_nchw(Input,
                      Filter,
                      stride,
                      padding,
                      dilation,
                      groups,
                      out_dtype=None):
    """Group convolution operator in NCHW layout.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    Filter : tvm.Tensor
        4-D with shape [num_filter, in_channel // groups, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    dilation : int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    groups : int
        number of groups

    out_dtype : str
        The output type. This is used for mixed precision.

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channel, in_height, in_width = get_const_tuple(Input.shape)
    num_filter, _, kernel_h, kernel_w = get_const_tuple(Filter.shape)

    assert in_channel % groups == 0, "input channels must divide group size"
    assert num_filter % groups == 0, "output channels must divide group size"

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (kernel_h, kernel_w))
    # compute the output shape
    out_channel = num_filter
    out_height = simplify(
        (in_height -
         (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width -
         (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w +
        1)
    # compute graph
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    temp = pad(Input, pad_before, pad_after, name="pad_temp")
    rc = tvm.reduce_axis((0, in_channel // groups), name='rc')
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')
    return tvm.compute(
        (batch, out_channel, out_height, out_width),
        lambda nn, ff, yy, xx: tvm.sum(
            temp[nn, ff // (num_filter // groups) *
                 (in_channel // groups) + rc, yy * stride_h + ry * dilation_h,
                 xx * stride_w + rx * dilation_w].astype(out_dtype) * Filter[
                     ff, rc, ry, rx].astype(out_dtype),
            axis=[rc, ry, rx]),
        tag='group_conv2d_nchw')
Beispiel #38
0
def conv2d_NCHWc(data,
                 kernel,
                 stride,
                 padding,
                 dilation,
                 layout,
                 out_layout,
                 out_dtype='float32'):
    """Conv2D operator for nChw[x]c layout.

    Parameters
    ----------
    data : tvm.Tensor
        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]

    kernel : tvm.Tensor
        6-D with shape
        [num_filter_chunk, in_channel_chunk, filter_height, filter_width,
        in_channel_block, num_filter_block]

    stride : int or a list/tuple of two ints
        stride size, or [stride_height, stride_width]

    padding : int or a list/tuple of two ints
        padding size, or [pad_height, pad_width]

    dilation: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    layout : str
        Input data layout

    out_layout : str
        Output data layout

    out_dtype : str
        output data type

    Returns
    -------
    output : tvm.Tensor
        5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
    """
    # search platform specific declaration first
    # default declaration
    # layout and out_layout are not used here,
    # we keep them for debug convenience when dumping autotvm workload
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    HPAD = pad_top + pad_down
    WPAD = pad_left + pad_right
    HSTR, WSTR = stride if isinstance(stride,
                                      (tuple, list)) else (stride, stride)
    dh, dw = dilation if isinstance(dilation,
                                    (tuple, list)) else (dilation, dilation)
    assert (dh, dw) == (1, 1), "Does not support dilation"

    n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape)
    in_channel = ic_chunk * ic_bn
    if data.dtype == 'uint8':
        oc_chunk, _, kernel_height, kernel_width, _, oc_bn, _ = get_const_tuple(
            kernel.shape)
    else:
        oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple(
            kernel.shape)
    num_filter = oc_chunk * oc_bn

    # output shape
    out_height = (ih + 2 * HPAD - kernel_height) // HSTR + 1
    out_width = (iw + 2 * WPAD - kernel_width) // WSTR + 1
    oshape = (n, oc_chunk, out_height, out_width, oc_bn)

    # DOPAD
    DOPAD = (HPAD != 0 or WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
    else:
        data_pad = data

    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')

    if data.dtype == 'uint8':
        assert out_dtype == "int32", \
            "INT8 convolution requires input dtype = uint8 and output dtype=int32"
        # Intel performs dot product of 2 "4" Int8 values
        # Current implementation requires ic_bn to be a multiple of 4
        n_elems = 4
        assert ic_bn % n_elems == 0

        ic_outer = tvm.reduce_axis((0, in_channel // ic_bn), name='ic_outer')
        ic_f_inner = tvm.reduce_axis((0, ic_bn // n_elems), name='ic_f_inner')
        ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
        return tvm.compute(
            oshape,
            lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(
                data_pad[n, ic_outer, oh * HSTR + kh, ow * WSTR + kw,
                         ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) *
                kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block,
                       ic_s_inner].astype(out_dtype),
                axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]),
            name='conv2d_NCHWc_int8',
            tag="conv2d_NCHWc_int8")
    # else: fp implementation
    return tvm.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_pad[
            n, ic // ic_bn, oh * HSTR + kh, ow * WSTR + kw, ic % ic_bn].astype(
                out_dtype) * kernel[oc_chunk, ic // ic_bn, kh, kw, ic % ic_bn,
                                    oc_block],
                                                      axis=[ic, kh, kw]),
        name='conv2d_NCHWc',
        tag="conv2d_NCHWc")
Beispiel #39
0
def spatial_pack_nhwc(cfg,
                      data,
                      kernel,
                      stride,
                      padding,
                      in_bits,
                      weight_bits,
                      pack_dtype='uint32',
                      out_dtype='int16',
                      unipolar=True):
    """ Compute convolution with pack on spatial axes. """
    assert data.shape[
        0].value == 1, "spatial pack convolution only support batch size=1"
    data_q = bitpack(data,
                     in_bits,
                     pack_axis=3,
                     bit_axis=4,
                     pack_type=pack_dtype)
    pack_kernel = len(kernel.shape) == 4

    if pack_kernel:
        kernel_q = bitpack(kernel,
                           weight_bits,
                           pack_axis=2,
                           bit_axis=4,
                           pack_type=pack_dtype)
    else:
        kernel_q = kernel

    KH, KW, _, CO, KB = get_const_tuple(kernel_q.shape)
    N, H, W, CI, IB = get_const_tuple(data_q.shape)

    if isinstance(padding, int) or (isinstance(padding, (tuple, list))
                                    and len(padding) == 2):
        TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel)
    else:
        TPAD, LPAD, DPAD, RPAD = padding
    pad_before = [0, TPAD, LPAD, 0, 0]
    pad_after = [0, DPAD, RPAD, 0, 0]

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride
    HCAT, WCAT = KH - 1, KW - 1

    PAD_H = H + (TPAD + DPAD)
    PAD_W = W + (LPAD + RPAD)
    OH = (PAD_H - KH) // HSTR + 1
    OW = (PAD_W - KW) // WSTR + 1
    oshape = (1, OH, OW, CO)

    # ==================== define configuration space ====================
    n, oh, ow, co = cfg.axis(N), cfg.axis(OH), cfg.axis(OW), cfg.axis(CO)
    ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
    ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits)

    co, vc = cfg.define_split('tile_co',
                              co,
                              num_outputs=2,
                              filter=lambda x: max(x.size[1:]) <= 16)
    oh, vh = cfg.define_split('tile_oh',
                              oh,
                              num_outputs=2,
                              filter=lambda x: max(x.size[1:]) <= 16)
    ow, vw = cfg.define_split('tile_ow',
                              ow,
                              num_outputs=2,
                              filter=lambda x: max(x.size[1:]) <= 16)
    cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll')
    cfg.define_reorder("reorder_0",
                       [n, oh, ow, co, vh, vw, kh, kw, kb, ib, vc, ci],
                       policy='interval_all',
                       interval=(3, 7))
    # binary ops
    cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW *
                 binary_op_multiplier(pack_dtype))
    # ====================

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    dvshape = (1, PAD_H // (VH * HSTR), PAD_W // (VW * WSTR), VH * HSTR + HCAT,
               VW * WSTR + WCAT, CI, IB)
    kvshape = (CO, KH, KW, CI, VC, KB)
    ovshape = (1, OH, OW, CO, VH, VW, VC)
    oshape = (1, OH, OW, CO)

    if (DPAD != 0 and RPAD != 0):
        data_pad = pad(data_q, pad_before, pad_after, name="data_pad")
    else:
        data_pad = data_q

    data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, ci, b: \
        data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][ci][b], name='data_vec')

    kernel_vec = tvm.compute(kvshape, lambda co, dh, dw, ci, vc, b: \
        kernel_q[dh][dw][ci][co*VC+vc][b], name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')
    b1 = tvm.reduce_axis((0, IB), name='ib')
    b2 = tvm.reduce_axis((0, KB), name='kb')

    def _conv(n, h, w, co, vh, vw, vc):
        b1b2 = (b1 + b2).astype(out_dtype)
        if unipolar:
            return tvm.sum(
                ((tvm.popcount(
                    data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ci, b1]
                    & kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) -
                  tvm.popcount(
                      data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ci, b1]
                      & ~kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype))
                 << b1b2),
                axis=[dh, dw, ci, b1, b2])

        return tvm.sum(tvm.popcount(
            data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ci, b1]
            & kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) << b1b2,
                       axis=[dh, dw, ci, b1, b2])

    conv = tvm.compute(ovshape, _conv, name='conv')

    idxdiv = tvm.indexdiv
    idxmod = tvm.indexmod
    return tvm.compute(
        oshape,
        lambda n, h, w, co: conv[n][idxdiv(h, VH)][idxdiv(w, VW)][idxdiv(
            co, VC)][idxmod(h, VH)][idxmod(w, VW)][idxmod(co, VC)],
        name='output_unpack',
        tag='spatial_bitserial_conv_nhwc')
Beispiel #40
0
def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None):
    """Convolution operator in NCHW layout.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    Filter : tvm.Tensor
        4-D with shape [num_filter, in_channel, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    dilation: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channel, in_height, in_width = Input.shape
    num_filter, channel, kernel_h, kernel_w = Filter.shape
    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_channel = num_filter
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
    # compute graph
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    temp = pad(Input, pad_before, pad_after, name="pad_temp")
    rc = tvm.reduce_axis((0, in_channel), name='rc')
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')

    return tvm.compute((batch, out_channel, out_height, out_width),
                       lambda nn, ff, yy, xx: tvm.
                       sum(temp[nn, rc, yy * stride_h + ry * dilation_h, xx *
                                stride_w + rx * dilation_w].astype(out_dtype) *
                           Filter[ff, rc, ry, rx].astype(out_dtype),
                           axis=[rc, ry, rx]),
                       tag="conv2d_nchw")
Beispiel #41
0
def gemm_int8(n, m, l):
    A = tvm.placeholder((n, l), name='A', dtype='int8')
    B = tvm.placeholder((m, l), name='B', dtype='int8')

    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((n, m), lambda i, j: tvm.sum(A[i, k].astype('int32') * B[j, k].astype(
        'int32'), axis=k), name='C')

    cfg = autotvm.get_config()
    s = tvm.create_schedule(C.op)
    y, x = C.op.axis

    AA = s.cache_read(A, 'shared', [C])
    BB = s.cache_read(B, 'shared', [C])
    AL = s.cache_read(AA, 'local', [C])
    BL = s.cache_read(BB, 'local', [C])
    CC = s.cache_write(C, 'local')

    k = CC.op.reduce_axis[0]

    cfg.define_split('tile_k', cfg.axis(k), num_outputs=3,
                     filter=lambda entity: entity.size[2] == 4 and \
                     entity.size[0] * 2 >= entity.size[1])

    ko, kt, ki = cfg['tile_k'].apply(s, CC, k)

    s[CC].tensorize(ki, intrin_dp4a)

    block_x = tvm.thread_axis('blockIdx.x')
    block_y = tvm.thread_axis('blockIdx.y')
    thread_x = tvm.thread_axis('threadIdx.x')
    thread_y = tvm.thread_axis('threadIdx.y')

    def block_size_filter(entity):
        return entity.size[0] * 2 >= entity.size[1] * 2 and \
                entity.size[1] <= 16 and entity.size[3] <= 4
    cfg.define_split('tile_y', cfg.axis(y), num_outputs=4, filter=block_size_filter)
    cfg.define_split('tile_x', cfg.axis(x), num_outputs=4, filter=block_size_filter)
    by, tyz, ty, yi = cfg['tile_y'].apply(s, C, y)
    bx, txz, tx, xi = cfg['tile_x'].apply(s, C, x)

    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].bind(tyz, tvm.thread_axis('vthread'))
    s[C].bind(txz, tvm.thread_axis('vthread'))
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi)

    s[CC].compute_at(s[C], tx)

    yo, xo = CC.op.axis
    s[CC].reorder(ko, kt, yo, xo, ki)
    s[CC].unroll(kt)

    for stage in [AL, BL]:
        s[stage].compute_at(s[CC], kt)
        _, xi = s[stage].split(stage.op.axis[1], factor=4)
        s[stage].vectorize(xi)
        s[stage].double_buffer()

    cfg.define_knob('storage_align', [16, 48])
    for stage in [AA, BB]:
        s[stage].storage_align(s[stage].op.axis[0],
                               cfg['storage_align'].val, 0)
        s[stage].compute_at(s[CC], ko)

        fused = s[stage].fuse(*s[stage].op.axis)
        ty, tx = s[stage].split(fused, nparts=cfg['tile_y'].size[2])
        tx, xi = s[stage].split(tx, nparts=cfg['tile_x'].size[2])
        _, xi = s[stage].split(xi, factor=16)

        s[stage].bind(ty, thread_y)
        s[stage].bind(tx, thread_x)
        s[stage].vectorize(xi)

    cfg.define_knob('auto_unroll_max_step', [512, 1500])
    s[C].pragma(by, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
    s[C].pragma(by, 'unroll_explicit', False)

    cfg.add_flop(n*m*l*2)
    return s, [A, B, C]
Beispiel #42
0
def bitserial_conv2d_nchw(data,
                          kernel,
                          stride,
                          padding,
                          activation_bits,
                          weight_bits,
                          pack_dtype='uint32',
                          out_dtype='int16',
                          unipolar=True):
    """Bitserial Conv2D operator.

    Parameters
    ----------
    input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    filter : tvm.Tensor
        4-D with shape [num_filter, in_channel, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        stride size, or [stride_height, stride_width]

    padding : int or a list/tuple of two or four ints
        padding size, [pad_height, pad_width], [pad_top, pad_left, pad_down, pad_right]

    activation_bits: int
        number of bits used for activations/input elements

    weight_bits: int
        number of bits used for weight elements

    out_dtype: str
        return type of convolution

    pack_dtype: str
        bit packing type

    unipolar: bool
        if binarization style is in unipolar 1/0 format, instead of bipolar -1/+1 format

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    assert isinstance(stride, int) or len(stride) == 2
    Input_q = bitpack(data,
                      activation_bits,
                      pack_axis=1,
                      bit_axis=2,
                      pack_type=pack_dtype)
    if len(filter.shape) == 4:
        Filter_q = bitpack(filter,
                           weight_bits,
                           pack_axis=1,
                           bit_axis=4,
                           pack_type=pack_dtype)
    else:
        Filter_q = filter
    batch, in_channel, activation_bits, in_height, in_width = Input_q.shape
    num_filter, _, kernel_h, kernel_w, weight_bits = Filter_q.shape

    if isinstance(padding, int) or (isinstance(padding, (tuple, list))
                                    and len(padding) == 2):
        TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel)
    else:
        TPAD, LPAD, DPAD, RPAD = padding
    pad_before = [0, 0, 0, TPAD, LPAD]
    pad_after = [0, 0, 0, DPAD, RPAD]

    PadInput_q = pad(Input_q, pad_before, pad_after, name="pad_temp")
    # compute the output shape
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride
    out_channel = num_filter
    out_height = (in_height - kernel_h + TPAD + DPAD) // stride_h + 1
    out_width = (in_width - kernel_w + LPAD + RPAD) // stride_w + 1

    rc = tvm.reduce_axis((0, in_channel), name='rc')
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')
    b1 = tvm.reduce_axis((0, activation_bits), name='b1')
    b2 = tvm.reduce_axis((0, weight_bits), name='b2')

    if unipolar:

        def _conv(nn, ff, yy, xx):
            b1b2 = (b1 + b2).astype(out_dtype)
            return tvm.sum(
                ((tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry,
                                          xx * stride_w + rx]
                               & Filter_q[ff, rc, ry, rx, b2]) -
                  tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry,
                                          xx * stride_w + rx]
                               & ~Filter_q[ff, rc, ry, rx, b2])) <<
                 (b1b2)).astype(out_dtype),
                axis=[rc, ry, rx, b2, b1]).astype(out_dtype)
    else:

        def _conv(nn, ff, yy, xx):
            b1b2 = (b1 + b2).astype(out_dtype)
            return tvm.sum(
                (tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry,
                                         xx * stride_w + rx]
                              & Filter_q[ff, rc, ry, rx, b2]) <<
                 (b1b2)).astype(out_dtype),
                axis=[rc, ry, rx, b2, b1]).astype(out_dtype)

    return tvm.compute((batch, out_channel, out_height, out_width),
                       _conv,
                       name="Conv2dOutput",
                       tag="bitserial_conv2d_nchw")
Beispiel #43
0
def _declaration_conv_NCHWc(cfg, data, kernel, strides, padding, dilation,
                            layout, out_layout, out_dtype):
    # layout and out_layout are not used here,
    # we keep them for debug convenience when dumping autotvm workload
    HPAD, WPAD = padding if isinstance(padding,
                                       (tuple, list)) else (padding, padding)
    HSTR, WSTR = strides if isinstance(strides,
                                       (tuple, list)) else (strides, strides)
    dh, dw = dilation if isinstance(dilation,
                                    (tuple, list)) else (dilation, dilation)
    assert (dh, dw) == (1, 1), "Does not support dilation"

    n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape)
    in_channel = ic_chunk * ic_bn
    if data.dtype == 'uint8':
        oc_chunk, _, kernel_height, kernel_width, _, oc_bn, _ = get_const_tuple(
            kernel.shape)
    else:
        oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple(
            kernel.shape)
    num_filter = oc_chunk * oc_bn

    if cfg.is_fallback:
        _get_default_config(
            cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype),
            tvm.placeholder(
                (num_filter, in_channel, kernel_height, kernel_width),
                dtype=kernel.dtype), strides, padding, out_dtype)

    # output shape
    out_height = (ih + 2 * HPAD - kernel_height) // HSTR + 1
    out_width = (iw + 2 * WPAD - kernel_width) // WSTR + 1
    oshape = (n, oc_chunk, out_height, out_width, oc_bn)

    # DOPAD
    DOPAD = (HPAD != 0 or WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
    else:
        data_pad = data

    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')

    if data.dtype == 'uint8':
        assert out_dtype == "int32", \
            "INT8 convolution requires input dtype = uint8 and output dtype=int32"
        # Intel performs dot product of 2 "4" Int8 values
        # Current implementation requires ic_bn to be a multiple of 4
        n_elems = 4
        assert ic_bn % n_elems == 0

        ic_outer = tvm.reduce_axis((0, in_channel // ic_bn), name='ic_outer')
        ic_f_inner = tvm.reduce_axis((0, ic_bn // n_elems), name='ic_f_inner')
        ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
        return tvm.compute(
            oshape,
            lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(
                data_pad[n, ic_outer, oh * HSTR + kh, ow * WSTR + kw,
                         ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) *
                kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block,
                       ic_s_inner].astype(out_dtype),
                axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]),
            name='conv2d_NCHWc_int8',
            tag="conv2d_NCHWc_int8")
    # else: fp implementation
    return tvm.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_pad[
            n, ic // ic_bn, oh * HSTR + kh, ow * WSTR + kw, ic % ic_bn].astype(
                out_dtype) * kernel[oc_chunk, ic // ic_bn, kh, kw, ic % ic_bn,
                                    oc_block],
                                                      axis=[ic, kh, kw]),
        name='conv2d_NCHWc',
        tag="conv2d_NCHWc")
Beispiel #44
0
import tvm
import numpy as np

######################################################################
# Define Matrix Multiplication
# ----------------------------
# Take matrix multiplication as our example.
# Matmul first multiply the corresponding elements between two matrix,
# then accumulate across a certain axis.
# The following lines describe the computation :code:`A * B^T` in TVM.
#
N, M, L = 1024, 512, 64
A = tvm.placeholder((N, L), name='A')
B = tvm.placeholder((M, L), name='B')
k = tvm.reduce_axis((0, L), name='k')
C = tvm.compute((N, M),
                lambda i, j: tvm.sum(A[i, k] * B[j, k], axis=k),
                name='C')
s = tvm.create_schedule(C.op)
print(tvm.lower(s, [A, B, C], simple_mode=True))

######################################################################
# Schedule the Matmul
# -------------------
# Now, suppose we have an accelerator that supports
# matrix-vector multiplication (GEMV) as a hardware primitive,
# which can take arbitrary size of reduce axis,
# but another axis needs to be no larger than 16.
# Thus we break down the matmul loops to make the innermost loops a (16x64) GEMV.
#
Beispiel #45
0
def test():
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (2, 2, 16)
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder(shape, dtype_w, 'a')

    sph = ScheduleProcHelper()

    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)

    k = tvm.reduce_axis((0, 2), 'k')
    add_buf = tvm.compute(
        (2, 16), lambda i, j: tvm.sum(a_buf[k, i, j], axis=k), 'add_buf')
    sph.MarkScope(add_buf)
    add_host, add_dram = nnpu.utils.CopyBufToH(add_buf, 'add', sph)

    k1 = tvm.reduce_axis((0, 2), 'k1')
    mul_buf = tvm.compute(
        (2, 16), lambda i, j: tvm.sum(a_buf[k1, i, j], axis=k1), 'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    s = tvm.create_schedule([add_host.op, mul_host.op])
    sph.Transform(s)

    ko, ki = s[add_buf].split(add_buf.op.reduce_axis[0], factor=1)
    s[add_buf].reorder(ko, ki, *(s[add_buf].op.axis))
    s[add_buf].tensorize(ki, env.intrins.get('MAddMerge',
                                             shape=shape,
                                             mode='w'))

    ko1, ki1 = s[mul_buf].split(mul_buf.op.reduce_axis[0], factor=1)
    s[mul_buf].reorder(ko1, ki1, *(s[mul_buf].op.axis))
    s[mul_buf].tensorize(ki1,
                         env.intrins.get('MMulMerge', shape=shape, mode='w'))

    print(nnpu.lower(s, [a, add_host, mul_host], simple_mode=True))

    func = nnpu.build(s, [a, add_host, mul_host],
                      'nnpu',
                      'llvm',
                      name='nnpu_func')
    #exit()
    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(2, 2, 16), dtype=a.dtype, low=-16, high=16)
    a_nd = tvm.nd.array(a_np, ctx)

    add_nd = tvm.nd.array(np.zeros((2, 16)).astype(add_host.dtype), ctx)

    mul_nd = tvm.nd.array(np.zeros((2, 16)).astype(mul_host.dtype), ctx)

    func(a_nd, add_nd, mul_nd)

    print('a = ')
    print(a_np)
    print('reduce sum row = ')
    print(add_nd.asnumpy())
    print('ground truth is: ')
    gt = np.sum(a_np, axis=0)
    print(gt)
    np.testing.assert_allclose(add_nd.asnumpy(), gt)

    print('reduce mul row = ')
    print(mul_nd.asnumpy())
    gt = np.multiply.reduce(a_np, axis=0, dtype=a.dtype)
    print(gt)
    np.testing.assert_allclose(mul_nd.asnumpy(), gt)
                               'a = np.random.rand(M, K).astype(dtype)\n'
                               'b = np.random.rand(K, N).astype(dtype)\n',
                               stmt='answer = np.dot(a, b)',
                               number=np_repeat)
print("Numpy running time: %f" % (np_runing_time / np_repeat))

# ground truth
a = tvm.nd.array(np.random.rand(M, K).astype(dtype), ctx)
b = tvm.nd.array(np.random.rand(K, N).astype(dtype), ctx)
c = tvm.nd.array(np.zeros((M, N), dtype=dtype), ctx)
answer = np.dot(a.asnumpy(), b.asnumpy())

###################
# TVM part
# Algorithm
k = tvm.reduce_axis((0, K), 'k')
A = tvm.placeholder((M, K), name='A')
B = tvm.placeholder((K, N), name='B')
C = tvm.compute((M, N),
                lambda x, y: tvm.sum(A[x, k] * B[k, y], axis=k),
                name='C')

# Default schedule
s = tvm.create_schedule(C.op)
func = tvm.build(s, [A, B, C], target=target, name='mmult')
print(tvm.lower(s, [A, B, C], simple_mode=True))

func(a, b, c)
tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)

evaluator = func.time_evaluator(func.entry_name, ctx, number=1)
Beispiel #47
0
def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96):
    A = tvm.placeholder((n, l), name='A', dtype='float16')
    B = tvm.placeholder((l, m), name='B', dtype='float16')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((n, m), lambda i, j: tvm.sum(
        A[i, k].astype('float32') * B[k, j].astype('float32'), axis=k))
    s = tvm.create_schedule(C.op)
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    AA = s.cache_read(A, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BB = s.cache_read(B, "shared", [C])
    BL = s.cache_read(BB, "local", [C])
    CL = s.cache_write(C, "local")

    bx = 4
    by = 32
    step_k = 8
    v = 4
    TX = 8
    TY = 1
    tile_x = bx * TX
    tile_y = by * TY
    WX = min(warp_tile_m, tile_x)
    tile_k = 16
    vthread = 1

    yo, ty = s[C].split(y, tile_y * vthread)
    vy, ty = s[C].split(ty, tile_y)
    ty, yi = s[C].split(ty, TY)

    xo, xi = s[C].split(x, tile_x)
    tz, xi = s[C].split(xi, WX)
    tx, xi = s[C].split(xi, TX)
    ko, ki = s[CL].split(k, step_k * tile_k)
    kl, ki = s[CL].split(ki, tile_k)

    s[C].reorder(yo, xo, tz, ty, tx, yi, xi)
    s[C].bind(yo, tvm.thread_axis("blockIdx.y"))
    s[C].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[C].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[C].bind(tz, tvm.thread_axis("threadIdx.z"))
    s[C].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[C].bind(vy, tvm.thread_axis((0, vthread), "vthread", name="vy"))
    s[CL].compute_at(s[C], tx)
    yo, xo = CL.op.axis
    s[CL].reorder(ko, kl, ki, yo, xo)

    s[AA].compute_at(s[CL], ko)
    xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx * v)
    tz, tx = s[AA].split(xi, factor=(WX // TX) * v)
    tx, vec = s[AA].split(tx, factor=v)
    fused = s[AA].fuse(s[AA].op.axis[0], xo)
    _, ty = s[AA].split(fused, factor=by)
    s[AA].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[AA].bind(tz, tvm.thread_axis("threadIdx.z"))
    s[AA].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[AA].vectorize(vec)

    s[BB].compute_at(s[CL], ko)
    xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx * v)
    tz, tx = s[BB].split(xi, factor=(WX // TX) * v)
    tx, vec = s[BB].split(tx, factor=v)
    fused = s[BB].fuse(s[BB].op.axis[0], xo)
    _, ty = s[BB].split(fused, factor=by)
    s[BB].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[BB].bind(tz, tvm.thread_axis("threadIdx.z"))
    s[BB].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[BB].vectorize(vec)

    s[AL].compute_at(s[CL], kl)
    s[BL].compute_at(s[CL], kl)

    s[CL].pragma(ko, 'tensor_core')

    func = tvm.build(s, [A, B, C], 'cuda')

    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
    b_np = np.random.uniform(size=(l, m)).astype(B.dtype)
    c_np = np.zeros((n, m), dtype=np.float32)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
    func(a, b, c)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=3)
    print('gemm m=%d n=%d k=%d: %f ms' %
          (m, n, l, evaluator(a, b, c).mean * 1e3))

    c_np = np.dot(a_np, b_np)
    np.testing.assert_allclose(c_np, c.asnumpy(), rtol=1e-3)
def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation,
                       out_dtype, num_tile):
    out_dtype = out_dtype or data.dtype

    N, C, IH, IW = get_const_tuple(data.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        pre_packed = False
        C, M, KH, KW = get_const_tuple(kernel.shape)
    else:  # kernel tensor is pre packed
        pre_packed = True
        C, M, KH, KW, VC = get_const_tuple(kernel.shape)
        C = C * VC

    dilated_kernel_h = (KH - 1) * dilation_h + 1
    dilated_kernel_w = (KW - 1) * dilation_w + 1

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    HSTR, WSTR = strides if isinstance(strides,
                                       (tuple, list)) else (strides, strides)
    OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1
    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
    # pack data
    HPAD = pad_top + pad_down
    WPAD = pad_left + pad_right
    DOPAD = (HPAD != 0 or WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, pad_top, pad_left),
                       (0, 0, pad_down, pad_right),
                       name="data_pad")
    else:
        data_pad = data

    # fallback support
    # Currently, Mali schedule doesn't use it like conv2d.
    if cfg.is_fallback:
        ref_log = autotvm.tophub.load_reference_log('arm_cpu', 'rk3399',
                                                    'depthwise_conv2d_nchw',
                                                    'contrib_spatial_pack')
        cfg.fallback_with_reference_log(ref_log)

    # ==================== define configuration space ====================
    n, c, oh, ow = cfg.axis(N), cfg.axis(C), cfg.axis(OH), cfg.axis(OW)
    kh, kw = cfg.reduce_axis(KH), cfg.reduce_axis(KW)

    # Currently, Mali schedule doesn't use it like conv2d.
    # Leave num_tile for possible future use of Mali schedule
    if num_tile == 2:  # for arm cpu
        co, vc = cfg.define_split('tile_co', c, num_outputs=2)
        oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
        ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
    else:
        raise RuntimeError("Invalid num_tile")

    cfg.define_reorder("reorder_0", [n, co, oh, ow, kh, kw, vh, vw, vc],
                       policy='candidate',
                       candidate=[[n, co, oh, ow, kh, kw, vh, vw, vc],
                                  [n, co, oh, ow, kh, kw, vc, vh, vw]])

    cfg.define_reorder("reorder_1", [n, co, oh, ow, vh, vw, vc],
                       policy='candidate',
                       candidate=[[n, co, oh, ow, vh, vw, vc],
                                  [n, co, oh, ow, vc, vh, vw],
                                  [n, co, oh, ow, vh, vc, vw]])

    cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
    # ====================================================================

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    kvshape = (C // VC, M, KH, KW, VC)
    ovshape = (N, C * M // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (N, C * M, OH, OW)

    if dilation_h != 1 or dilation_w != 1:
        # undilate input data
        dvshape = (N, OH // VH, OW // VW, C, KH, KW, VH, VW)
        data_vec = tvm.compute(
            dvshape,
            lambda n, h, w, c, kh, kw, vh, vw: data_pad[n][c][
                (h * VH + vh) * HSTR + kh * dilation_h][
                    (w * VW + vw) * WSTR + kw * dilation_w],
            name='data_vec_undilated')
    else:
        dvshape = (N, OH // VH, OW // VW, C, VH * HSTR + KH - 1,
                   VW * WSTR + KW - 1)
        data_vec = tvm.compute(dvshape,
                               lambda n, h, w, c, vh, vw: data_pad[n][c][
                                   h * VH * HSTR + vh][w * VW * WSTR + vw],
                               name='data_vec')

    if pre_packed:
        kernel_vec = kernel
    else:
        kernel_vec = tvm.compute(
            kvshape,
            lambda co, m, kh, kw, vc: kernel[co * VC + vc][m][kh][kw],
            name='kernel_vec')

    kh = tvm.reduce_axis((0, KH), name='kh')
    kw = tvm.reduce_axis((0, KW), name='kw')

    if dilation_h != 1 or dilation_w != 1:
        conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
                          tvm.sum(data_vec[n, h, w, (co * VC + vc) // M, kh, kw, vh, vw]
                                  .astype(out_dtype) *
                                  kernel_vec[co // M, co % M, kh, kw, vc].astype(out_dtype),
                                  axis=[kh, kw]), name='depthwise_conv')
    else:
        conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
                           tvm.sum(data_vec[n, h, w, (co * VC + vc) // M, vh * HSTR + kh,
                                            vw * WSTR + kw].astype(out_dtype) *
                                   kernel_vec[co // M, co % M, kh, kw, vc].astype(out_dtype),
                                   axis=[kh, kw]), name='depthwise_conv')

    output = tvm.compute(oshape,
                         lambda n, co, h, w: conv[n][co // VC][h // VH][
                             w // VW][h % VH][w % VW][co % VC],
                         name='output_unpack',
                         tag='spatial_depthwise_conv_nchw_output')
    return output
Beispiel #49
0
def measure_bandwidth_sum(total_item, item_per_thread, stride, base_type, bits,
                          lanes, target, target_host, remote, ctx, n_times):
    """ measure memory bandwidth of gpu by product reduction for a given type

    The IR for measurement is

    for each thread
        for i in 1..num_per_thread:
            y[global_id] = y[global_id] * x[base + i * stride]

    Parameters
    ----------
    total_item: int
        number of elements in input array
    item_per_thread: int
        number of elements each thread accumulates
    stride: int
        stride in memory access
    base_type: str
        can be "int", "float"
    bits: int
        can be 16, 32
    lanes: int
       lane of the vector type, can be 1, 2, 4, 8, 16
    target: :any:`tvm.target.Target`
        the target and option of the compilation.
    target_host : str or :any:`tvm.target.Target`
        host compilation target
    ctx: TVMcontext
        the context of array
    remote: tvm.rpc.RPCSession
        remote rpc session
    n_times: int
        number of runs for taking mean

    Returns
    -------
    GBPS: float
         gigabyte per second
    """
    n, m = total_item, item_per_thread
    n //= lanes

    base_type = str(base_type) + str(bits)
    dtype = base_type if lanes == 1 else base_type + "x" + str(lanes)

    k = tvm.reduce_axis((0, m), name="k")

    x = tvm.placeholder((n, ), dtype=dtype, name="x")
    op = tvm.comm_reducer(lambda x, y: x * y,
                          lambda t: tvm.const(1, dtype=t),
                          name="sum")
    y = tvm.compute((n // m, ), lambda i: op(
        x[i // stride * stride * m + i % stride + k * stride], axis=k))
    s = tvm.create_schedule(y.op)

    yo, yi = s[y].split(y.op.axis[0], target.max_num_threads)
    s[y].bind(yo, tvm.thread_axis("blockIdx.x"))
    s[y].bind(yi, tvm.thread_axis("threadIdx.x"))
    s[y].unroll(k)

    try:
        func = tvm.build(s, [x, y], target, target_host=target_host)

        x = tvm.nd.empty((n, ), dtype=dtype, ctx=ctx)
        y = tvm.nd.empty((n // m, ), dtype=dtype, ctx=ctx)

        func = _convert_to_remote(func, remote)
        time_f = func.time_evaluator(func.entry_name, ctx, number=n_times)
        time = time_f(x, y).mean
    except tvm._ffi.base.TVMError:
        # build error (occur when device does not support half)
        return -1

    return 1.0 * (total_item * bits / 8) / 1e9 / time
Beispiel #50
0
def _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits,
                       weight_bits, out_dtype):
    """ Compute convolution with pack on spatial axes. """
    assert data.shape[
        0].value == 1, "spatial pack convolution only support batch size=1"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NHWC")
    sch = _get_schedule(wkl, "NHWC")
    VH = sch.vh
    VW = sch.vw
    VC = sch.vc

    data_q = bitpack(data,
                     activation_bits,
                     pack_axis=3,
                     bit_axis=3,
                     pack_type='uint8')
    kernel_vec = _kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC)
    N, H, W, IB, CI = data_q.shape
    OCO, KH, KW, KB, VC, _ = kernel_vec.shape

    CO = OCO * VC
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride
    HCAT, WCAT = KH - 1, KW - 1

    PAD_H = H + 2 * HPAD
    PAD_W = W + 2 * WPAD
    OH = (H + 2 * HPAD - KH) // HSTR + 1
    OW = (W + 2 * WPAD - KW) // WSTR + 1
    dvshape = (N, PAD_H // (VH * HSTR), PAD_W // (VW * WSTR), VH * HSTR + HCAT,
               VW * WSTR + WCAT, IB, CI)
    ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC)
    oshape = (1, OH, OW, CO)

    if (HPAD != 0 and WPAD != 0):
        data_pad = pad(data_q, (0, HPAD, WPAD, 0, 0), name="data_pad")
    else:
        data_pad = data_q

    data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, b, ci: \
        data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][b][ci], name='data_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')
    ib = tvm.reduce_axis((0, IB), name='ib')
    kb = tvm.reduce_axis((0, KB), name='kb')

    def _conv(n, h, w, co, vh, vw, vc):
        return tvm.sum(
            (tvm.popcount(kernel_vec[co, dh, dw, kb, vc, ci].astype('uint16')
                          & data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw,
                                     ib, ci].astype('uint16')) <<
             (kb + ib).astype('uint16')),
            axis=[dh, dw, kb, ib, ci])

    conv = tvm.compute(ovshape, _conv, name='conv')

    return tvm.compute(oshape,
                       lambda n, h, w, co: conv[n][h // VH][w // VW][co // VC][
                           h % VH][w % VW][co % VC].astype(out_dtype),
                       name='output_vec',
                       tag='spatial_bitserial_conv_nhwc')
Beispiel #51
0
def comm_reduce(data,
                axis=None,
                keepdims=False,
                func=tvm.sum,
                is_idx_reduce=False):
    """Reducing the data

    Parameters
    ----------
    data : tvm.Tensor
        The input data

    axis : None or int or tuple of int
        Axis or axes along which a sum is performed.
        The default, axis=None, will sum all of the elements of the input array.
        If axis is negative it counts from the last to the first axis.

    keepdims : bool
        If this is set to True, the axes which are reduced are left in the result as dimensions
         with size one.
        With this option, the result will broadcast correctly against the input array.

    func : function
        functions like tvm.sum, tvm.max, tvm.min

    Returns
    -------
    ret : tvm.Tensor
    """
    ndim = len(data.shape)
    assert ndim != 0, "Reduce a dim-0 input is not supported!"
    real_axis = _get_real_axis(ndim, axis)
    reduce_axes = [
        tvm.reduce_axis((0, data.shape[i]), "k%d" % i) for i in real_axis
    ]
    if keepdims:
        target_shape = [
            1 if i in real_axis else data.shape[i] for i in range(ndim)
        ]
    else:
        target_shape = []
        for i in range(ndim):
            if i not in real_axis:
                target_shape.append(tvm.convert(data.shape[i]))

    def _compute(*indices):
        eval_range = []
        eval_indices = []
        if not keepdims:
            arg_counter = 0
        else:
            arg_counter = None
        red_counter = 0
        for i in range(len(data.shape)):
            if i in real_axis:
                eval_range.append(reduce_axes[red_counter])
                eval_indices.append(reduce_axes[red_counter].var)
                red_counter += 1
            else:
                if not keepdims:
                    eval_range.append(indices[arg_counter])
                    arg_counter += 1
                else:
                    eval_range.append(indices[i])
        if not is_idx_reduce:
            return func(data[tuple(eval_range)], axis=reduce_axes)
        idx = ravel_index(eval_indices, [data.shape[i] for i in real_axis])
        return func((idx, data[tuple(eval_range)]), axis=reduce_axes)

    if is_idx_reduce:
        temp_idx, temp_val = tvm.compute(target_shape,
                                         _compute,
                                         name=data.name + "_red_temp")
        out = tvm.compute(
            target_shape,
            lambda *indices: _choose_idx(temp_idx, temp_val, *indices),
            name=data.name + "_red")
    else:
        out = tvm.compute(target_shape, _compute, name=data.name + "_red")
    return out
Beispiel #52
0
def _decl_direct(data, kernel, stride, padding, layout, out_dtype):
    """declare the direct method (spatial packing) for conv2d"""
    _, CI, IH, IW = [util.get_const_int(x) for x in data.shape]
    CO, _, KH, KW = [util.get_const_int(x) for x in kernel.shape]
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
    HCAT, WCAT = KH - 1, KW - 1

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride

    N = 1
    TH = IH + 2 * HPAD
    TW = IW + 2 * WPAD
    OH = (IH + 2 * HPAD - KH) // HSTR + 1
    OW = (IW + 2 * WPAD - KW) // WSTR + 1

    DO_PAD = (HPAD != 0 and WPAD != 0)
    if DO_PAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    # set tunable parameters (tile factor, ...)
    tune_config = getattr(tvm.target.current_target(), "tune_config", None)
    if tune_config is None:
        VH = 1
        VW, VC = 4, 4
        # correct tile factor
        if OW % VW != 0:
            if OW == 14:
                VW = 2
                VC = 8
            elif OW == 7:
                VW = 7
    else:
        VH = tune_config['VH']
        VW = tune_config['VW']
        VC = tune_config['VC']

    if data.dtype == 'float16':
        VC *= 2

    assert CO % VC == 0
    assert OH % VH == 0, "OH: %d  VH : %d" % (OH, VH)
    assert OW % VW == 0, "OW: %d  VW : %d" % (OW, VW)

    dvshape = (N, TH // (VH * HSTR), TW // (VW * WSTR), CI, VH * HSTR + HCAT,
               VW * WSTR + WCAT)
    kvshape = (CO // VC, CI, KH, KW, VC)
    ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (N, CO, OH, OW)

    data_vec = tvm.compute(dvshape,
                           lambda n, h, w, ci, vh, vw: data_pad[n][ci][
                               h * VH * HSTR + vh][w * VW * WSTR + vw],
                           name='data_vec')

    kernel_vec = tvm.compute(
        kvshape,
        lambda co, ci, kh, kw, vc: kernel[co * VC + vc][ci][kh][kw],
        name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    kh = tvm.reduce_axis((0, KH), name='kh')
    kw = tvm.reduce_axis((0, KW), name='kw')

    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc:\
                tvm.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) *
                        kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
                        axis=[ci, kh, kw]), name='conv')

    output = tvm.compute(oshape,
                         lambda n, co, h, w: conv[n][co // VC][h / VH][w // VW]
                         [h % VH][w % VW][co % VC],
                         name='output_unpack',
                         tag='direct_conv_output')

    return output
Beispiel #53
0
def matmul():
    # Algorithm
    k = tvm.reduce_axis((0, K), 'k')
    """Create a new IterVar for reduction.
     Parameters
     ----------
     dom : Range
         The domain of iteration.
     name : str
         The name of the variable.
     Returns
     -------
     axis : IterVar
         An iteration variable representing the value.
     """
    A = tvm.placeholder((M, K), name='A')
    B = tvm.placeholder((K, N), name='B')

    ##### define space begin #####
    cfg = autotvm.get_config()
    """Get current config object
      Returns
      -------
      cfg: ConfigSpace or ConfigEntity
          The current config
      """
    cfg.define_split("tile_x", M, num_outputs=2)
    cfg.define_split("tile_y", N, num_outputs=2)
    cfg.define_split("tile_k", K, num_outputs=2)
    """Define a new tunable knob which splits an axis into a list of axes
    Parameters
    ----------
    name: str
        name to index the entity of this space
    axis: tvm.schedule.IterVar
        axis to split
    policy: str
        name of policy.
        If is 'factors', the tuner will try all divisible factors.
        If is 'power2', the tuner will try power-of-two factors less or equal to the length.
        If is 'verbose', the tuner will try all candidates in above two policies.
        If is 'candidate', try given candidates.
    kwargs: dict
        extra arguments for policy
        max_factor: int
            the maximum split factor.
        filter: function(int) -> bool
            see examples below for how to use filter.
        num_outputs: int
            the total number of axis after split.
        no_tail: bool
            should we only include divisible numbers as split factors.
        candidate: list
            (policy=candidate) manual candidate list.

    Examples
    --------
    >>> # use custom candidates
    >>> cfg.define_split('tile_x', x, policy='candidate', candidate=[[1, 4, 4], [4, 1, 4]])

    >>> # use a filter that only accepts the split scheme whose inner most tile is less then 4
    >>> cfg.define_split('tile_y', y, policy='factors', filter=lambda x: x.size[-1] <= 4)
    """

    ##### define space end #####

    # We have to re-write the algorithm slightly.
    #print("cfg[tile_y]",cfg["tile_y"])#打印tile_y的候选空间,如[-1,128]
    xn = cfg["tile_x"].size[-1]
    bn = cfg["tile_y"].size[-1]  #只打印列表里的最后一个,如上面的128
    kn = cfg["tile_k"].size[-1]
    #print("xn:",xn,"bn:",bn,"kn:",kn)

    packedB = tvm.compute((N / bn, K, bn),
                          lambda x, y, z: B[y, x * bn + z],
                          name='packedB')
    """Construct a new tensor by computing over the shape domain.
        The compute rule is result[axis] = fcompute(axis)
        Parameters
        ----------
        shape: Tuple of Expr
            The shape of the tensor
        fcompute: lambda function of indices-> value
            Specifies the input source expression
        name: str, optional
            The name hint of the tensor
        tag: str, optional
            Additional tag information about the compute.
        attrs: dict, optional
            The additional auxiliary attributes about the compute.
        Returns
        -------
        tensor: Tensor
            The created tensor
        """
    #" // " 表示整数除法,返回不大于结果的一个最大的整数
    C = tvm.compute(
        (M, N),
        lambda x, y: tvm.sum(A[x, k] * packedB[y // bn, k, y % bn], axis=k),
        name='C')

    s = tvm.create_schedule(C.op)
    """Create a schedule for list of ops
      Parameters
      ----------
      ops : list of Operations
          The source expression.
      Returns
      -------
      sch : schedule.Schedule
          The created schedule.
      """
    x, y = s[C].op.axis
    k, = s[C].op.reduce_axis
    #print("x:", (x))#x: iter_var(x, range(min=0, ext=1024))

    # schedule according to config
    # Allocate write cache
    CC = s.cache_write(C, 'global')
    '''
    在存储到tensor之前,创建原始tensor的缓存写入。这会使张量体发生变异。
在传入张量之前,将创建一个新的缓存阶段。此函数可用于支持数据布局转换。
如果在张量的数据平行轴上存在分裂/融合/重新排序在调用缓存写入之前。中间缓存存储
布局中的数据作为离开轴的迭代顺序。数据将转换回原始张量中的原始布局。用户可以进一步调用
compute_inline以内联原始布局并保持存储在转换后的布局中的数据。
 Parameters
        ----------
        tensor : Tensor, list or tuple
            The tensors to be feed to. All the tensors must be produced by one computeOp
        scope : str
            The scope of cached
        Returns
        -------
        cache : Tensor
            The created cache tensor.
        """
    '''
    xo, xi = cfg["tile_x"].apply(s, C, x)
    yo, yi = cfg["tile_y"].apply(s, C, y)
    s[C].reorder(xo, yo, xi, yi)

    # Write cache is computed at yo
    s[CC].compute_at(s[C], yo)
    """Attach the stage at parent's scope
           Parameters
           ----------
           parent : Stage
               The parent stage
           scope : IterVar
               The loop scope t be attached to.
           """
    # New inner axes
    xc, yc = s[CC].op.axis
    k, = s[CC].op.reduce_axis

    ko, ki = cfg["tile_k"].apply(s, CC, k)
    s[CC].reorder(ko, xc, ki, yc)
    s[CC].unroll(ki)
    """Unroll the iteration.
            Parameters
            ----------
            var : IterVar
                The iteration to be unrolled.
            """

    s[CC].vectorize(yc)
    """Vectorize the iteration.
           Parameters
           ----------
           var : IterVar
               The iteration to be vectorize
           """
    # parallel
    s[C].parallel(xo)
    """Parallelize the iteration.
          Parameters
          ----------
          var : IterVar
              The iteration to be parallelized.
          """
    x, y, z = s[packedB].op.axis
    s[packedB].vectorize(z)
    s[packedB].parallel(x)
    return s, [A, B, C]
Beispiel #54
0
def test_gemm():
    # graph
    nn = 1024
    n = tvm.var('n')
    n = tvm.convert(nn)
    m = n
    l = n
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((m, l), name='B')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute(
        (n, m),
        lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k),
        name='CC')
    # schedule
    s = tvm.create_schedule(C.op)
    xtile, ytile = 32, 32
    scale = 8
    num_thread = 8
    block_factor = scale * num_thread
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_y = tvm.thread_axis("threadIdx.y")

    CC = s.cache_write(C, "local")
    AA = s.cache_read(A, "shared", [CC])
    BB = s.cache_read(B, "shared", [CC])
    by, yi = s[C].split(C.op.axis[0], factor=block_factor)
    bx, xi = s[C].split(C.op.axis[1], factor=block_factor)
    s[C].reorder(by, bx, yi, xi)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    ty, yi = s[C].split(yi, nparts=num_thread)
    tx, xi = s[C].split(xi, nparts=num_thread)
    s[C].reorder(ty, tx, yi, xi)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)


    s[CC].compute_at(s[C], tx)
    s[AA].compute_at(s[CC], k)
    s[BB].compute_at(s[CC], k)
    s[AA].double_buffer()
    s[BB].double_buffer()
    ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread)
    tx, xi = s[AA].split(xi, nparts=num_thread)
    s[AA].bind(ty, thread_y)
    s[AA].bind(tx, thread_x)

    ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread)
    tx, xi = s[BB].split(xi, nparts=num_thread)
    s[BB].bind(ty, thread_y)
    s[BB].bind(tx, thread_x)

    # lowering test
    s = s.normalize()

    # one line to build the function.
    def check_device(device):
        if not tvm.module.enabled(device):
            print("skip because %s is not enabled.." % device)
            return

        f = tvm.build(s, [A, B, C], device)
        ctx = tvm.context(device, 0)
        # launch the kernel.
        n = nn
        m = n
        l = n
        a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
        b_np = np.random.uniform(size=(m, l)).astype(B.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        ftimer = f.time_evaluator(f.entry_name, ctx, number=1)
        tcost = ftimer(a, b, c).mean
        print("%s: exec=%g sec/op" % (ctx, tcost))
        np.testing.assert_allclose(
            c.asnumpy(), np.dot(a_np, b_np.T), rtol=1e-5)

    check_device("nvptx -mcpu=sm_20")
    check_device("rocm")
    check_device("metal")
    check_device("opencl")
    check_device("cuda")
Beispiel #55
0
def _intrin_popcount(m, k_i, w_b, x_b):
    dtype = 'uint8'
    w = tvm.placeholder((w_b, m, k_i), dtype=dtype, name='w')
    x = tvm.placeholder((
        x_b,
        k_i,
    ), dtype=dtype, name='x')
    k = tvm.reduce_axis((0, k_i), name='k')
    bw = tvm.reduce_axis((0, w_b), name='bw')
    bx = tvm.reduce_axis((0, x_b), name='bx')
    z = tvm.compute((m, ),
                    lambda i: tvm.sum(tvm.popcount(w[bw, i, k].astype(
                        'uint16') & x[bx, k].astype('uint16')) <<
                                      (bw + bx).astype('uint16'),
                                      axis=[bw, bx, k]),
                    name='z')

    Wb = tvm.decl_buffer(w.shape,
                         w.dtype,
                         name="W",
                         offset_factor=k_i,
                         strides=[tvm.var('ldw'),
                                  tvm.var('ldw'), 1])
    Xb = tvm.decl_buffer(x.shape,
                         x.dtype,
                         name="X",
                         offset_factor=k_i,
                         strides=[tvm.var('ldw'), 1])

    def _intrin_func(ins, outs):
        ww, xx = ins
        zz = outs[0]
        vpadd = "llvm.arm.neon.vpadd.v8u8"
        vpadalu = "llvm.arm.neon.vpadalu.v16u8.v8u16"
        args_1 = tvm.const(1, 'uint32')
        args_2 = tvm.const(2, 'uint32')

        def _instr(index):
            irb = tvm.ir_builder.create()
            if index == 1:
                irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8')))
                return irb.get()

            cnts8 = [None] * 8
            cnts4 = [None] * 4
            cnts2 = [None] * 2
            for bw in range(w_b):
                for bx in range(x_b):
                    if k_i == 16:
                        for i in range(m):
                            ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload(
                                [bx, 0], 'uint8x16')
                            cnts = tvm.popcount(ands)
                            upper_half = tvm.call_pure_intrin(
                                'uint8x8', 'vectorhigh', cnts)
                            lower_half = tvm.call_pure_intrin(
                                'uint8x8', 'vectorlow', cnts)
                            cnts8[i] = upper_half + lower_half
                        for i in range(m // 2):
                            cnts4[i] = tvm.call_llvm_intrin(
                                'uint8x8', vpadd, args_1, cnts8[i * 2],
                                cnts8[i * 2 + 1])
                        for i in range(m // 4):
                            cnts2[i] = tvm.call_llvm_intrin(
                                'uint8x8', vpadd, args_1, cnts4[i * 2],
                                cnts4[i * 2 + 1])
                        cnts = tvm.call_pure_intrin('uint8x16',
                                                    'vectorcombine', cnts2[0],
                                                    cnts2[1])
                        shifted_cnts = cnts << tvm.const(bw + bx, dtype)
                        out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2,
                                                   zz.vload(0, 'uint16x8'),
                                                   shifted_cnts)
                    else:  # ki == 8
                        for i in range(m):
                            ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload(
                                [bx, 0], 'uint8x8')
                            cnts8[i] = tvm.popcount(ands)
                        for i in range(m // 2):
                            cnts4[i] = tvm.call_llvm_intrin(
                                'uint8x8', vpadd, args_1, cnts8[i * 2],
                                cnts8[i * 2 + 1])
                        for i in range(m // 4):
                            cnts2[i] = tvm.call_llvm_intrin(
                                'uint8x8', vpadd, args_1, cnts4[i * 2],
                                cnts4[i * 2 + 1])
                        cnts = tvm.call_pure_intrin('uint8x16',
                                                    'vectorcombine', cnts2[0],
                                                    cnts2[1])
                        shifted_cnts = cnts << tvm.const(bw + bx, dtype)
                        out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2,
                                                   zz.vload(0, 'uint16x8'),
                                                   shifted_cnts)
                    irb.emit(zz.vstore(0, out))
            return irb.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    with tvm.build_config(offset_factor=1, partition_const_loop=True):
        return tvm.decl_tensor_intrin(z.op, _intrin_func, binds={w: Wb, x: Xb})
def bitserial_dense(data,
                    weight,
                    data_bits,
                    weight_bits,
                    pack_dtype='uint32',
                    out_dtype='int16',
                    unipolar=True):
    """The default implementation of bitserial dense in topi.

    Parameters
    ----------
    data : tvm.Tensor
        2-D with shape [batch, in_dim]
    weight : tvm.Tensor
        2-D with shape [out_dim, in_dim] or
        3-D with shape [out_dim, weight_bits, in_dim]
    Returns
    -------
    output : tvm.Tensor
        2-D with shape [batch, out_dim]
    """
    data_packed = bitpack(data,
                          data_bits,
                          pack_axis=1,
                          bit_axis=1,
                          pack_type=pack_dtype)
    if len(weight.shape) == 2:
        weight_packed = bitpack(weight,
                                weight_bits,
                                pack_axis=1,
                                bit_axis=1,
                                pack_type=pack_dtype)
    else:
        weight_packed = weight
    Y, DB, K = get_const_tuple(data_packed.shape)
    X, WB, _ = get_const_tuple(weight_packed.shape)

    oshape = (Y, X)
    k = tvm.reduce_axis((0, K), name='k')
    db = tvm.reduce_axis((0, DB), name='db')
    wb = tvm.reduce_axis((0, WB), name='wb')

    matmul_unipolar = tvm.compute(
        oshape,
        lambda i, j: tvm.sum((tvm.popcount(weight_packed[
            j, wb, k] & data_packed[i, db, k]) - tvm.popcount(~weight_packed[
                j, wb, k] & data_packed[i, db, k])).astype(out_dtype) <<
                             (db + wb).astype(out_dtype),
                             axis=[wb, db, k]),
        tag='bitserial_dense_unipolar')

    matmul = tvm.compute(
        oshape,
        lambda i, j: tvm.sum(tvm.popcount(weight_packed[
            j, wb, k] & data_packed[i, db, k]).astype(out_dtype) <<
                             (db + wb).astype(out_dtype),
                             axis=[wb, db, k]),
        tag='bitserial_dense')

    if unipolar:
        return matmul_unipolar
    return matmul
Beispiel #57
0
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout,
                   out_dtype, tile_size):
    N, CI, IH, IW = get_const_tuple(data.shape)
    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:

        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
        pre_computed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:
        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
        pre_computed = True
        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
        CO *= VC
        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
    HSTR, WSTR = strides if isinstance(strides,
                                       (tuple, list)) else (strides, strides)
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    assert layout == 'NCHW'
    assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    r = KW
    m = tile_size
    alpha = m + r - 1
    A, B, G = winograd_transform_matrices(m, r, out_dtype)

    H = (IH + 2 * HPAD - 3) // HSTR + 1
    W = (IW + 2 * WPAD - 3) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    ##### space definition begin #####
    tile_bna_candidates = [1, 2, 4, 8, 16]
    factors = get_factors(CO)
    cfg.define_knob('tile_bna',
                    [x for x in tile_bna_candidates if x in factors])
    cfg.define_knob('tile_bnb', [1, 2, 4, 8, 16])
    cfg.define_split('tile_t1', CI, num_outputs=2, max_factor=128)
    cfg.define_split('tile_t2', CO, num_outputs=2, max_factor=128)
    cfg.define_split('c_unroll', CI, num_outputs=2, max_factor=8)
    cfg.define_knob('yt', [1, 2, 4, 8, 16, 32])
    ##### space definition end #####

    if cfg.is_fallback:
        cfg['tile_bnb'].val = 4
        cfg['tile_bna'].val = 4
        while CO % cfg['tile_bna'].val != 0:
            cfg['tile_bna'].val //= 2
        cfg['yt'].val = 8
        cfg.fallback_split('tile_t1', [-1, 128])
        cfg.fallback_split('tile_t2', [-1, 128])
        cfg.fallback_split('c_unroll', [-1, 8])

    bna = cfg['tile_bna'].val
    bnb = cfg['tile_bnb'].val

    P_round = (P + bnb - 1) // bnb * bnb
    assert CO % bna == 0 and P_round % bnb == 0

    # pack input tile
    input_tile = tvm.compute((CI, P_round // bnb, alpha, alpha, bnb), lambda ci, b, eps, nu, bb: \
         tvm.if_then_else(
             b * bnb + bb < P,
             data_pad[(b*bnb+bb) // (nH*nW)][ci][(b*bnb+bb) // nW % nH * m + eps]
             [(b*bnb+bb) % nW * m + nu], tvm.const(0, data_pad.dtype)), name='d')

    # transform kernel
    if pre_computed:
        U = kernel
    else:
        r_kh = tvm.reduce_axis((0, KH), 'r_kh')
        r_kw = tvm.reduce_axis((0, KW), 'r_kw')
        U = tvm.compute(
            (alpha, alpha, CO // bna, CI, bna),
            lambda eps, nu, co, ci, vco: tvm.sum(kernel[co * bna + vco][ci][
                r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
                                                 axis=[r_kh, r_kw]),
            name='U')

    # transform image
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    V = tvm.compute((alpha, alpha, P_round // bnb, CI, bnb),
                    lambda eps, nu, p, ci, vp: tvm.sum(input_tile[ci][p][r_a][
                        r_b][vp] * B[r_a][eps] * B[r_b][nu],
                                                       axis=[r_a, r_b]),
                    name='V')

    # batch gemm
    ci = tvm.reduce_axis((0, CI), name='c')
    M = tvm.compute((alpha, alpha, CO, P_round),
                    lambda eps, nu, co, p: tvm.sum(U[eps][nu][co // bna][ci][
                        co % bna] * V[eps][nu][p // bnb][ci][p % bnb],
                                                   axis=ci),
                    name='M')

    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    Y = tvm.compute(
        (CO, P, m, m),
        lambda co, p, vh, vw: tvm.sum(
            M[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]),
        name='Y')

    # unpack output
    output = tvm.compute(
        (N, CO, H, W),
        lambda n, co, h, w: Y[co][n * nH * nW +
                                  (h // m) * nW + w // m][h % m][w % m]
        # The following hack term is used to make the padding in batch gemm ("M")
        # effective, otherwise the padding will be eliminated by bound inference.
        # Use `tvm.expr.Mul` instead of `*` to avoid issues in const folding.
        + tvm.expr.Mul(tvm.const(0, out_dtype), M[alpha - 1][alpha - 1][CO - 1]
                       [P_round - 1]),
        name='output',
        tag='winograd_conv2d_output')

    # we have to manually assign effective GFLOP for winograd
    cfg.add_flop(2 * N * CO * H * W * KH * KW * CI)
    return output
Beispiel #58
0
def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float16'):
    batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape]
    num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape]
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel)

    if isinstance(stride, (tuple, list)):
        stride_h, stride_w = stride
    else:
        stride_h, stride_w = stride, stride

    out_channel = num_filter
    out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
    oshape = (batch, out_channel, out_height, out_width)
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    temp = pad(data, pad_before, pad_after, name="pad_temp")

    rc = tvm.reduce_axis((0, in_channel), name='rc')
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')

    block_w = 0
    block_h = 0
    if stride_h == 2:
        if num_filter + kernel_h == 515:
            conv_tag = "4_4"
            block_h = 4
            block_w = 4
        else:
            conv_tag = "4_5"
            block_h = 4
            block_w = 5
    elif kernel_h == 3:
        if num_filter == 512:
            conv_tag = "2_7"
            block_h = 2
            block_w = 7
        else:
            conv_tag = "2_14"
            block_h = 2
            block_w = 14
    else:
        conv_tag = "1_16"
        block_h = 1
        block_w = 16

    c_h = out_height
    c_w = out_width

    if not out_height % block_h == 0:
        c_h = (out_height // block_h + 1) * block_h

    if not out_width % block_w == 0:
        c_w = (out_width // block_w + 1) * block_w

    nv = 16
    cshape = (batch, out_channel // nv, c_h, c_w, nv)
    kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv)

    kernel_vec = tvm.compute(
        kvshape,
        lambda co, ci, kh, kw, vc:
        kernel[co*nv + vc][ci][kh][kw], name='kernel_vec')

    conv = tvm.compute(
        cshape,
        lambda nn, ff, yy, xx, vc:\
          tvm.sum(
              temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
              kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype),
              axis=[rc, ry, rx]), tag=conv_tag, name='conv')

    output = tvm.compute(
        oshape,
        lambda nn, ff, yy, xx:
        conv[nn][ff//nv][yy][xx][ff%nv],
        name='output_unpack', tag=conv_tag)

    return output
Beispiel #59
0
def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'):
    """Convolution operator in NHWC layout.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]

    Filter : tvm.Tensor
        4-D with shape [filter_height, filter_width, in_channel, num_filter]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    dilation: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, out_height, out_width, out_channel]
    """
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_height, in_width, in_channel = Input.shape
    kernel_h, kernel_w, channel, num_filter = Filter.shape
    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_channel = num_filter
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_down, pad_right, 0]
    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
    rc = tvm.reduce_axis((0, in_channel), name='rc')
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')
    Output = tvm.compute((batch, out_height, out_width, out_channel),
                         lambda nn, yy, xx, ff: tvm.sum(PaddedInput[
                             nn, yy * stride_h + ry * dilation_h, xx * stride_w
                             + rx * dilation_w, rc].astype(out_dtype) * Filter[
                                 ry, rx, rc, ff].astype(out_dtype),
                                                        axis=[ry, rx, rc]),
                         name="Conv2dOutput",
                         tag="conv2d_nhwc")
    return Output
Beispiel #60
0
def conv2d_transpose_nchw_cuda(cfg, Input, Filter, strides, padding,
                               out_dtype):
    """Transposed 2D convolution nchw forward operator.

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]
    Filter : tvm.Tensor
        4-D with shape [in_channel, num_filter, filter_height, filter_width]
    strides : tuple of two ints
        The spatial stride along height and width
    padding : int or str
        Padding size, or ['VALID', 'SAME']
    out_dtype: str
        The output type. This is used in mixed precision

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    batch, in_c, in_h, in_w = get_const_tuple(Input.shape)
    _, out_c, filter_h, filter_w = get_const_tuple(Filter.shape)
    stride_h, stride_w = strides

    # attach stride info to config, this is used in schedule space definition
    cfg.stride = strides

    # padding stage
    fpad_top, fpad_left, fpad_bottom, fpad_right = nn.get_pad_tuple(
        padding, (filter_h, filter_w))
    bpad_top = filter_h - 1 - fpad_top
    bpad_bottom = filter_h - 1 - fpad_bottom
    bpad_left = filter_w - 1 - fpad_left
    bpad_right = filter_w - 1 - fpad_right

    # padding stage
    FirstPad = nn.pad(Input, [
        0, 0, (bpad_top + stride_h - 1) // stride_h,
        (bpad_left + stride_w - 1) // stride_w
    ], [
        0, 0, (bpad_bottom + stride_h - 1) // stride_h,
        (bpad_right + stride_w - 1) // stride_w
    ],
                      name='FirstPad')

    # remove extra padding introduced by dilatation
    border_h = (stride_h - bpad_top % stride_h) % stride_h
    border_w = (stride_w - bpad_left % stride_w) % stride_w

    # dilation stage
    data = FirstPad
    strides = [1, 1, stride_h, stride_w]
    n = len(data.shape)

    def _dilate(*indices):
        not_zero = []
        index_tuple = []
        for i in range(n):
            if not equal_const_int(strides[i], 1):
                index_tuple.append(indices[i] // strides[i])
                not_zero.append((indices[i] % strides[i]).equal(0))
            else:
                index_tuple.append(indices[i])
        if not_zero:
            not_zero = tvm.all(*not_zero)
            return tvm.if_then_else(not_zero, data(*index_tuple),
                                    tvm.const(0.0, data.dtype))
        return data(*index_tuple)

    # convolution stage
    out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h
    out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w
    dc = tvm.reduce_axis((0, in_c), name='dc')
    dh = tvm.reduce_axis((0, filter_h), name='dh')
    dw = tvm.reduce_axis((0, filter_w), name='dw')

    Output = tvm.compute(
        (batch, out_c, out_h, out_w),
        lambda b, c, h, w: tvm.sum(_dilate(
            b, dc, h + dh + border_h, w + dw + border_w).astype(
                out_dtype) * Filter[dc, c, filter_h - 1 - dh, filter_w - 1 - dw
                                    ].astype(out_dtype),
                                   axis=[dc, dh, dw]),
        tag="conv2d_transpose_nchw")

    return Output