Example #1
    def traverse(op):
        """Traverse operators from computation graph"""
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(op.tag):
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if tensor.op.input_tensors:
                    traverse(tensor.op)

        if 'conv2d_nchw' in op.tag:
            # print('Run in x86-rasp schedule')
            output = op.output(0)
            conv_out = op.input_tensors[0]
            kernel_vec = conv_out.op.input_tensors[1]
            kernel = kernel_vec.op.input_tensors[0]
            data_vec = conv_out.op.input_tensors[0]
            data = data_vec.op.input_tensors[0]
            data_pad = None
            if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
                data_pad = data
                data = data_pad.op.input_tensors[0]

            padding = infer_pad(data, data_pad)
            if data_pad is None:
                stride = infer_stride(data, kernel, output)
            else:
                stride = infer_stride(data_pad, kernel, output)

            wkl = _get_workload(data, kernel, stride, padding, output.dtype)
            sch = _get_schedule(wkl)
            return _SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec,
                                               kernel, kernel_vec, conv_out,
                                               output, outs[0])
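
Example #1 recovers stride and padding purely from tensor shapes via infer_pad and infer_stride before calling _get_workload. A minimal pure-Python sketch of how such inference can work (hypothetical helpers; the real ones live in TVM's topi utilities and handle symbolic shapes):

def infer_pad_sketch(data_shape, data_pad_shape):
    # recover symmetric (hpad, wpad) from NCHW shapes; None means no padding
    if data_pad_shape is None:
        return 0, 0
    _, _, ih, iw = data_shape
    _, _, th, tw = data_pad_shape
    return (th - ih) // 2, (tw - iw) // 2

def infer_stride_sketch(data_shape, kernel_shape, out_shape):
    # invert out = (in - k) // stride + 1 for each spatial dimension
    _, _, ih, iw = data_shape
    _, _, kh, kw = kernel_shape
    _, _, oh, ow = out_shape
    return ((ih - kh) // (oh - 1) if oh > 1 else 1,
            (iw - kw) // (ow - 1) if ow > 1 else 1)

# a 7x7 stride-2 conv on a 230x230 padded input yields a 112x112 output
assert infer_stride_sketch((1, 3, 230, 230), (64, 3, 7, 7),
                           (1, 64, 112, 112)) == (2, 2)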
Example #2
def get_workload(data, kernel, stride, padding, out_dtype):
    """ Get the workload structure. """
    CO, CI, ci, co, KH, KW = [x.value for x in kernel.shape]
    ori_kernel = tvm.placeholder((CO*co, CI*ci, KH, KW))
    n, _, h, w, _ = [x.value for x in data.shape]
    original_data = tvm.placeholder((n, CI * ci, h, w))
    return _get_workload(original_data, ori_kernel, stride, padding, out_dtype)
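
get_workload here only needs shapes, so it rebuilds placeholders in the original NCHW/OIHW layouts from the blocked ones. A NumPy illustration of the packing it inverts, with made-up block sizes:

import numpy as np

CO, CI, ci, co, KH, KW = 4, 2, 8, 16, 3, 3
original = np.random.rand(CO * co, CI * ci, KH, KW)
# (oc, ic, kh, kw) -> (OC, IC, ic, oc, kh, kw), matching kernel.shape above
packed = original.reshape(CO, co, CI, ci, KH, KW).transpose(0, 2, 3, 1, 4, 5)
assert packed.shape == (CO, CI, ci, co, KH, KW)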
Example #3
def alter_conv2d_layout(attrs, inputs, tinfos):
    copy_inputs = list(inputs)

    data = tinfos[0]
    kernel = tinfos[1]

    import ast
    padding = ast.literal_eval(attrs['padding'])
    stride = ast.literal_eval(attrs['strides'])

    wkl = _get_workload(data, kernel, stride, padding, data.dtype)
    sch = _get_schedule_conv(wkl)
    is_kernel_1x1 = isinstance(sch, AVX512Conv1x1Fwd)
    ic_bn, oc_bn = sch.ic_bn, sch.oc_bn

    new_attrs = {k: attrs[k] for k in attrs.keys()}
    new_attrs['layout'] = 'NCHW%dc' % ic_bn
    new_attrs['out_layout'] = 'NCHW%dc' % oc_bn

    if is_kernel_1x1:
        # (oc, ic, h, w) -> (OC, IC, ic, oc, h, w)
        new_attrs['kernel_layout'] = 'OI%di%doHW' % (ic_bn, oc_bn)
    else:
        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
        new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)

    return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
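
The layout strings tell NNVM how to repack the tensors: with ic_bn = oc_bn = 16, kernel_layout 'OIHW16i16o' means (oc, ic, h, w) becomes (oc//16, ic//16, h, w, 16, 16). A NumPy sketch of that transform with hypothetical sizes (the graph compiler performs it symbolically):

import numpy as np

oc, ic, h, w, ic_bn, oc_bn = 32, 16, 3, 3, 16, 16
kernel = np.random.rand(oc, ic, h, w)
# OIHW -> OIHW{ic_bn}i{oc_bn}o
packed = (kernel.reshape(oc // oc_bn, oc_bn, ic // ic_bn, ic_bn, h, w)
                .transpose(0, 2, 4, 5, 3, 1))
assert packed.shape == (oc // oc_bn, ic // ic_bn, h, w, ic_bn, oc_bn)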
Example #4
    def traverse(op):
        """Traverse operators from computation graph"""
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(op.tag):
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if tensor.op.input_tensors:
                    traverse(tensor.op)

        if 'conv2d_nChwc' in op.tag:
            print('Got conv2d_nChwc tag: ' + str(op.tag))
            output = op.output(0)
            # conv_out = op.input_tensors[0]
            conv_out = output
            kernel = conv_out.op.input_tensors[1]
            # kernel = kernel_vec.op.input_tensors[0]
            data_vec = conv_out.op.input_tensors[0]
            if isinstance(data_vec.op, tvm.tensor.ComputeOp) \
                    and data_vec.op.input_tensors and "pad" not in data_vec.op.tag:
                data = data_vec.op.input_tensors[0]
            else:
                data = data_vec
            data_pad = None
            if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
                data_pad = data
                data = data_pad.op.input_tensors[0]

            n, ic_chunk, h, w, ic_block = [x.value for x in data.shape]
            ic = ic_chunk * ic_block
            original_data = tvm.placeholder((n, ic, h, w), dtype=output.dtype)

            if data_pad is not None:
                n, _, pad_h, pad_w, _ = [x.value for x in data_pad.shape]
                original_data_pad = tvm.placeholder((n, ic, pad_h, pad_w),
                                                    dtype=output.dtype)
                padding = infer_pad(original_data, original_data_pad)
            else:
                padding = (0, 0)

            oc, kh, kw = kernel_size
            original_kernel = tvm.placeholder((oc, ic, kh, kw),
                                              dtype=output.dtype)

            n, oc_chunk, oh, ow, oc_block = [x.value for x in output.shape]
            original_output = tvm.placeholder((n, oc_chunk * oc_block, oh, ow),
                                              dtype=output.dtype)

            if data_pad is None:
                stride = infer_stride(original_data, original_kernel,
                                      original_output)
            else:
                stride = infer_stride(original_data_pad, original_kernel,
                                      original_output)

            wkl = _get_workload(original_data, original_kernel, stride,
                                padding, output.dtype)
            sch = _get_schedule(wkl)
            _SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec, kernel,
                                        conv_out, output, outs[0])
Example #5
def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
    assert layout == 'NCHW', "only support NCHW convolution on rasp"
    assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)
    return _SCH_TO_DECL_FUNC[type(sch)](data, kernel, stride, padding, layout,
                                        out_dtype)
Example #6
def _declaration_conv(data, kernel, kernel_size, stride, padding, layout, out_dtype):
    assert layout == 'NCHW', "only support NCHW convolution on avx"
    assert data.shape[0].value == 1, "only support batch size=1 convolution on avx"
    _, ic, _, _ = [x.value for x in data.shape]
    oc, kh, kw = kernel_size
    wkl = _get_workload(data, tvm.placeholder((oc, ic, kh, kw)), stride, padding, out_dtype)
    sch = _get_schedule(wkl)
    return _SCH_TO_DECL_FUNC[type(sch)](data, kernel, stride, padding, layout, out_dtype)
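
All these wrappers share one dispatch idiom: _get_schedule returns a schedule descriptor, and its concrete type keys a table of implementations. A minimal sketch of the pattern (the names below are illustrative, not the actual TOPI tables):

from collections import namedtuple

AVXConvCommonFwd = namedtuple('AVXConvCommonFwd', ['ic_bn', 'oc_bn', 'ur_w'])
AVXConv1x1Fwd = namedtuple('AVXConv1x1Fwd',
                           ['ic_bn', 'oc_bn', 'oh_factor', 'ow_factor'])

def _declare_common(wkl, sch):
    return 'common decl for %s' % (sch,)

def _declare_1x1(wkl, sch):
    return '1x1 decl for %s' % (sch,)

# one entry per schedule template; type(sch) picks the implementation
_SCH_TO_DECL_FUNC_SKETCH = {
    AVXConvCommonFwd: _declare_common,
    AVXConv1x1Fwd: _declare_1x1,
}

sch = AVXConv1x1Fwd(ic_bn=16, oc_bn=16, oh_factor=2, ow_factor=2)
assert _SCH_TO_DECL_FUNC_SKETCH[type(sch)](None, sch).startswith('1x1')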
Example #7
def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
    assert layout == 'NCHW', "only support NCHW convolution on rasp"
    assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)

    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
    num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)

    pad_height = in_height + 2 * HPAD
    pad_width = in_width + 2 * WPAD

    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1

    # input: c, h, w
    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data
    shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width,
             sch.ic_bn)
    data_vec = tvm.compute(
        shape, lambda n, C, h, w, c: data_pad[n, C * sch.ic_bn + c, h, w])

    shape = (num_filter // sch.oc_bn, in_channel // sch.ic_bn, sch.ic_bn,
             sch.oc_bn, 1, 1)
    kernel_pack = tvm.compute(
        shape, lambda CO, CI, ci, co, h, w: kernel[CO * sch.oc_bn + co,
                                                   CI * sch.ic_bn + ci, h, w])

    oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width,
              sch.oc_bn)
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    conv = tvm.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(
            data_vec[n, ic // sch.ic_bn, oh * HSTR, ow * WSTR,
                     ic % sch.ic_bn].astype(out_dtype) *
            kernel_pack[oc_chunk, ic // sch.ic_bn, ic % sch.ic_bn,
                        oc_block, 0, 0],
            axis=[ic]),
        name='conv')

    oshape = (batch_size, num_filter, out_height, out_width)
    unpack = tvm.compute(
        oshape,
        lambda n, oc, oh, ow: conv[n, oc // sch.oc_bn, oh, ow, oc % sch.oc_bn],
        tag='conv2d_nchw')
    return unpack
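
This declaration handles the 1x1 case: the reduction runs only over ic, and kernel_pack keeps its last two dimensions at 1. The packing arithmetic can therefore be checked against a plain convolution in NumPy; a sketch with made-up sizes, unit stride, and no padding:

import numpy as np

n, ic, oc, h, w = 1, 8, 16, 4, 4
ic_bn, oc_bn = 4, 8
data = np.random.rand(n, ic, h, w).astype('float32')
kernel = np.random.rand(oc, ic, 1, 1).astype('float32')

# pack data NCHW -> NCHW{ic_bn}c and kernel OIHW -> OI{ic_bn}i{oc_bn}o11
data_vec = data.reshape(n, ic // ic_bn, ic_bn, h, w).transpose(0, 1, 3, 4, 2)
kernel_pack = (kernel.reshape(oc // oc_bn, oc_bn, ic // ic_bn, ic_bn, 1, 1)
                     .transpose(0, 2, 3, 1, 4, 5))

# blocked conv over the full input channel, then unpack back to NCHW
conv = np.einsum('nihwc,oicbxy->nohwb', data_vec, kernel_pack)
unpack = conv.transpose(0, 1, 4, 2, 3).reshape(n, oc, h, w)

ref = np.einsum('nihw,oi->nohw', data, kernel[:, :, 0, 0])
assert np.allclose(unpack, ref, atol=1e-5)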
Example #8
    def check_device():
        A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
        W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')

        out_dtype = 'float32'

        wkl = _get_workload(A, W, stride, padding, out_dtype)

        a_shape = get_const_tuple(A.shape)
        w_shape = get_const_tuple(W.shape)

        dtype = A.dtype

        @memoize("topi.tests.test_topi_conv2d.verify_conv2d_nchw")
        def get_ref_data():
            a_np = np.random.uniform(size=a_shape).astype(dtype)
            w_np = np.random.uniform(size=w_shape).astype(dtype)
            b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
            c_np = np.maximum(b_np, 0)
            return a_np, w_np, b_np, c_np

        a_np, w_np, b_np, c_np = get_ref_data()
        device = 'llvm -mcpu=skylake-avx512'
        ctx = tvm.context(device, 0)
        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)

        with tvm.build_config(auto_unroll_max_step=1400,
                              unroll_explicit=(device != "cuda")):
            A_vec, s = _spatial_pack_data_only(wkl, sch, A)
            a_vec_shape = get_const_tuple(A_vec.shape)
            a_vec = tvm.nd.array(np.zeros(a_vec_shape, dtype=dtype), ctx)
            func = tvm.build(s, [A, A_vec], device)
            time_f = func.time_evaluator(func.entry_name, ctx, number=20)
            cost_data = time_f(a, a_vec).mean

            W_vec, s = _spatial_pack_kernel_only(wkl, sch, W)
            w_vec_shape = get_const_tuple(W_vec.shape)
            w_vec = tvm.nd.array(np.zeros(w_vec_shape, dtype=dtype), ctx)
            func = tvm.build(s, [W, W_vec], device)
            time_f = func.time_evaluator(func.entry_name, ctx, number=20)
            cost_kernel = time_f(w, w_vec).mean

            A_vec = tvm.placeholder(a_vec_shape, name='A_vec')
            W_vec = tvm.placeholder(w_vec_shape, name='W_vec')
            B, s = _spatial_conv_only(wkl, sch, A_vec, W_vec, out_dtype=dtype)
            b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
            func = tvm.build(s, [A_vec, W_vec, B], target=device)
            time_f = func.time_evaluator(func.entry_name, ctx, number=20)
            cost_conv = time_f(a_vec, w_vec, b).mean

            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
            return (cost_data, cost_kernel, cost_conv)
Example #9
def _declaration_conv(data, kernel, num_filter, kernel_size, stride, padding,
                      out_dtype):
    assert data.shape[0].value == 1, "only support batch size=1 convolution on avx"
    n, ic_chunk, h, w, ic_block = [x.value for x in data.shape]
    ic = ic_chunk * ic_block
    oc = num_filter
    kh, kw = kernel_size
    wkl = _get_workload(tvm.placeholder((n, ic, h, w), dtype=out_dtype),
                        tvm.placeholder((oc, ic, kh, kw), dtype=out_dtype),
                        stride, padding, out_dtype)
    sch = _get_schedule(wkl)
    return _SCH_TO_DECL_FUNC[type(sch)](wkl, data, kernel)
Example #10
    def traverse(op):
        """Traverse operators from computation graph"""
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(op.tag):
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if tensor.op.input_tensors:
                    traverse(tensor.op)

        if 'conv2d_nChwc' in op.tag:
            output = op.output(0)
            # conv_out = op.input_tensors[0]
            conv_out = op.input_tensors[0] if 'conv2d_nChwc_unpack' in op.tag else output
            kernel = conv_out.op.input_tensors[1]
            # kernel = kernel_vec.op.input_tensors[0]
            data_vec = conv_out.op.input_tensors[0]
            if isinstance(data_vec.op, tvm.tensor.ComputeOp) \
                    and data_vec.op.input_tensors and "pad" not in data_vec.op.tag:
                data = data_vec.op.input_tensors[0]
            else:
                data = data_vec
            data_pad = None

            if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
                data_pad = data
                data = data_pad.op.input_tensors[0]

            ndim_input = len(data.shape)
            if ndim_input == 5:
                n, ic_chunk, h, w, ic_block = [x.value for x in data.shape]
                ic = ic_chunk * ic_block
            else:
                n, ic, h, w = [x.value for x in data.shape]
            original_data = tvm.placeholder((n, ic, h, w), dtype=output.dtype)

            oc = num_filter
            kh, kw = kernel_size
            original_kernel = tvm.placeholder((oc, ic, kh, kw), dtype=output.dtype)

            wkl = _get_workload(original_data, original_kernel, stride, padding, output.dtype)
            sch = _get_schedule(wkl)
            _SCH_TO_SCH_FUNC[type(sch)](s, wkl, data, data_pad, data_vec,
                                        kernel, conv_out, output, outs[0])
Example #11
    def traverse(op):
        """Traverse operators from computation graph"""
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(op.tag):
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if tensor.op.input_tensors:
                    traverse(tensor.op)

        if 'conv2d_nchw' in op.tag:
            output = op.output(0)
            conv_out = op.input_tensors[0]
            kernel = conv_out.op.input_tensors[1]
            # kernel = kernel_vec.op.input_tensors[0]
            data_vec = conv_out.op.input_tensors[0]
            data = data_vec.op.input_tensors[0]
            data_pad = None
            if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
                data_pad = data
                data = data_pad.op.input_tensors[0]
            padding = infer_pad(data, data_pad)

            _, ic, _, _ = [x.value for x in data.shape]
            oc = num_filter
            kh, kw = kernel_size
            original_kernel = tvm.placeholder((oc, ic, kh, kw))

            if data_pad is None:
                stride = infer_stride(data, original_kernel, output)
            else:
                stride = infer_stride(data_pad, original_kernel, output)

            wkl = _get_workload(data, original_kernel, stride, padding,
                                output.dtype)
            sch = _get_schedule(wkl)
            _SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec, kernel,
                                        conv_out, output, outs[0])
Example #12
def weight_prepack_conv2d(attrs, inputs, tinfos):
    import ast
    data = tinfos[0]
    kernel = tinfos[1]
    padding = ast.literal_eval(attrs['padding'])
    stride = ast.literal_eval(attrs['strides'])
    wkl = _get_workload(data, kernel, stride, padding, 'float32')
    sch = _get_schedule_conv(wkl)
    is_kernel_1x1 = isinstance(sch, AVX512Conv1x1Fwd)

    ic_bn, oc_bn = sch.ic_bn, sch.oc_bn

    new_attrs = {k: attrs[k] for k in attrs.keys()}
    new_attrs.pop('layout', None)

    kernel_sym = inputs[1]
    oc, ic, h, w = get_const_tuple(tinfos[1].shape)
    OC = oc // oc_bn
    IC = ic // ic_bn
    trans_kernel = sym.transpose(kernel_sym, axes=(1, 2, 3, 0))
    trans_kernel = sym.reshape(trans_kernel, shape=(ic, h, w, OC, oc_bn))
    trans_kernel = sym.transpose(trans_kernel, axes=(1, 2, 3, 4, 0))
    trans_kernel = sym.reshape(trans_kernel,
                               shape=(h, w, OC, oc_bn, IC, ic_bn))
    if is_kernel_1x1:
        # (oc, ic, h, w) -> (OC, IC, ic, oc, h, w)
        trans_kernel = sym.transpose(trans_kernel, axes=(2, 4, 5, 3, 0, 1))
    else:
        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
        trans_kernel = sym.transpose(trans_kernel, axes=(2, 4, 0, 1, 5, 3))

    if attrs.get_bool('use_bias'):
        bias = inputs[2]
        bias = sym.reshape(bias, shape=(OC, oc_bn))
        return sym.contrib.conv2d_nchw_kernel_packed(inputs[0], trans_kernel,
                                                     bias, **new_attrs)
    else:
        return sym.contrib.conv2d_nchw_kernel_packed(inputs[0], trans_kernel,
                                                     **new_attrs)
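
The transpose/reshape chain above is easy to get wrong, so it helps to confirm numerically that it equals a direct split of oc and ic into (chunk, block) pairs. A NumPy check of the non-1x1 branch with hypothetical sizes:

import numpy as np

oc, ic, h, w = 16, 8, 3, 3
ic_bn, oc_bn = 4, 8
OC, IC = oc // oc_bn, ic // ic_bn
kernel = np.arange(oc * ic * h * w).reshape(oc, ic, h, w)

# the same chain as the sym.transpose/sym.reshape calls above
t = kernel.transpose(1, 2, 3, 0).reshape(ic, h, w, OC, oc_bn)
t = t.transpose(1, 2, 3, 4, 0).reshape(h, w, OC, oc_bn, IC, ic_bn)
packed = t.transpose(2, 4, 0, 1, 5, 3)  # (OC, IC, h, w, ic, oc)

# direct packing: ref[O, I, kh, kw, i, o] = kernel[O*oc_bn + o, I*ic_bn + i, kh, kw]
ref = kernel.reshape(OC, oc_bn, IC, ic_bn, h, w).transpose(0, 2, 4, 5, 3, 1)
assert (packed == ref).all()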
Example #13
def weight_prepack_conv2d(attrs, inputs, tinfos):
    import ast
    print(attrs)
    data_sym = inputs[0]
    data = tinfos[0]
    kernel = tinfos[1]
    padding = ast.literal_eval(attrs['padding'])
    stride = ast.literal_eval(attrs['strides'])
    wkl = _get_workload(data, kernel, stride, padding, 'float32')
    sch = _get_schedule_conv(wkl)
    print(sch)
    is_kernel_1x1 = isinstance(sch, AVX512Conv1x1Fwd)

    ic_bn, oc_bn = sch.ic_bn, sch.oc_bn
    # TODO: hack checking input layer
    if ic_bn == 3:
        data_sym = sym.expand_dims(data_sym, axis=4)

    new_attrs = {k: attrs[k] for k in attrs.keys()}
    new_attrs['layout'] = 'NCHWc'
    new_attrs['ic_bn'] = ic_bn
    new_attrs['oc_bn'] = oc_bn

    kernel_sym = inputs[1]
    reorder_attrs = {
        'ic_bn': ic_bn,
        'oc_bn': oc_bn,
        'kernel_1x1': is_kernel_1x1
    }
    trans_kernel = sym.reorder(kernel_sym, **reorder_attrs)

    if attrs.get_bool('use_bias'):
        bias = inputs[2]
        bias = sym.bn_reorder(bias, bn=oc_bn)
        print('!!!!!!!!!!conv2d_nopack')
        return sym.conv2d_nopack(data_sym, trans_kernel, bias, **new_attrs)
    else:
        return sym.conv2d_nopack(data_sym, trans_kernel, **new_attrs)
Example #14
def weight_prepack_conv2d(attrs, inputs, tinfos):
    import ast
    print(attrs)
    data = tinfos[0]
    kernel = tinfos[1]
    padding = ast.literal_eval(attrs['padding'])
    stride = ast.literal_eval(attrs['strides'])
    wkl = _get_workload(data, kernel, stride, padding, 'float32')
    sch = _get_schedule_conv(wkl)
    print(sch)
    is_kernel_1x1 = isinstance(sch, AVX512Conv1x1Fwd)

    ic_bn, oc_bn = sch.ic_bn, sch.oc_bn
    # new_attrs = {k : attrs[k] for k in attrs.keys()}
    # new_attrs['original_kernel_shape'] = str(kernel.shape)

    kernel_sym = inputs[1]
    reorder_attrs = {'ic_bn': ic_bn, 'oc_bn': oc_bn, 'kernel_1x1': is_kernel_1x1}
    trans_kernel = sym.reorder(kernel_sym, **reorder_attrs)

    if attrs.get_bool('use_bias'):
        return sym.conv2d_prepack(inputs[0], trans_kernel, inputs[2], **attrs)
    else:
        return sym.conv2d_prepack(inputs[0], trans_kernel, **attrs)
Example #15
def _schedule_im2col_conv2d(s, data, data_pad, data_col, data_vec, kernel,
                            kernel_vec, conv_out, output, last):
    # no stride and padding info here
    padding = infer_pad(data, data_pad)
    if data_pad is None:
        stride = infer_stride(data, kernel, output)
    else:
        stride = infer_stride(data_pad, kernel, output)
    wkl = _get_workload(data, kernel, stride, padding, output.dtype)

    sch = _schedule_conv2d(wkl)

    H, W = wkl.height, wkl.width
    CI = wkl.in_filter
    CO = wkl.out_filter
    HK, WK = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    HCAT, WCAT = HK - 1, WK - 1
    DOPAD = (HPAD != 0 and WPAD != 0)

    P = sch.vp
    Q = sch.vq
    UNROLL = sch.unroll

    A, B, C = data, kernel, last
    A0, A1, A2 = data_pad, data_col, data_vec
    B0 = kernel_vec
    C0, C1 = conv_out, output

    CC = s.cache_write(C0, "global")
    AA = s.cache_read(A2, "global", [CC])
    BB = s.cache_read(B0, "global", [CC])

    ##### Schedule CC
    _, co, im, vim, vco = s[C0].op.axis
    s[C0].unroll(vim)
    s[C0].vectorize(vco)

    s[CC].compute_at(s[C0], im)
    _, co, im, vim, vco = s[CC].op.axis
    ci, hk, wk = s[CC].op.reduce_axis
    s[CC].reorder(ci, hk, wk, vim, vco)
    s[CC].unroll(vim)
    s[CC].vectorize(vco)
    # s[CC].unroll(ccr)

    ### Schedule C
    _, co, h, w = s[C].op.axis
    im = s[C].fuse(h, w)
    im, vim = s[C].split(im, P)
    co, vco = s[C].split(co, Q)
    s[C].reorder(co, im, vim, vco)

    if sch.bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[C].split(co, sch.bc)
        oaxis = oco
        paxis = ico

    s[C].parallel(paxis)
    s[C].pragma(oaxis, "parallel_launch_point")
    s[C].pragma(paxis, "parallel_stride_pattern")
    s[C].pragma(oaxis, "parallel_barrier_when_finish")
    if C1 != C:
        s[C1].compute_inline()

    s[C0].compute_at(s[C], paxis)

    ##### Schedule A
    if DOPAD:
        s[A0].compute_inline()
    s[A1].compute_inline()
    s[AA].compute_at(s[CC], wk)
    s[AA].unroll(AA.op.axis[4])

    _, im, _, _, _, _ = s[A2].op.axis
    if sch.ba == 1:
        oaxis = im
        paxis = im
    else:
        oim, iim = s[A2].split(im, sch.ba)
        oaxis = oim
        paxis = iim

    s[A2].parallel(paxis)
    s[A2].pragma(oaxis, "parallel_launch_point")
    s[A2].pragma(paxis, "parallel_stride_pattern")
    s[A2].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule B
    s[BB].compute_at(s[CC], wk)
    s[BB].vectorize(BB.op.axis[4])

    co, _, _, _, _ = s[B0].op.axis
    if sch.bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[B0].split(co, sch.bc)
        oaxis = oco
        paxis = ico

    s[B0].parallel(paxis)
    s[B0].pragma(oaxis, "parallel_launch_point")
    s[B0].pragma(paxis, "parallel_stride_pattern")
    s[B0].pragma(oaxis, "parallel_barrier_when_finish")

    return s
Example #16
def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
    # print('Run in avx512_conv_common decl')
    assert layout == 'NCHW', "only support NCHW convolution on rasp"
    assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)

    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
    if len(kernel.shape) == 4:
        num_filter, _, kernel_height, kernel_width = get_const_tuple(
            kernel.shape)
    else:
        num_filter, _, kernel_height, kernel_width, ic, oc = get_const_tuple(
            kernel.shape)
        num_filter *= oc

    pad_height = in_height + 2 * HPAD
    pad_width = in_width + 2 * WPAD

    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1

    # pack data
    # input: c, h, w
    shape = (batch_size, in_channel, pad_height, pad_width)
    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data
    # data_pad = tvm.compute(shape, lambda n, c, h, w: tvm.select(
    #     tvm.all(h >= HPAD, h < pad_height - HPAD, w >= WPAD, w < pad_width - WPAD),
    #     data[n, c, h - HPAD, w - WPAD], 0.0
    # ), name='data_pad')

    shape = (batch_size, in_channel // sch.ic_bn, pad_height, sch.ic_bn,
             pad_width)
    data_vec = tvm.compute(
        shape,
        lambda n, C, h, c, w: data_pad[n, C * sch.ic_bn + c, h, w],
        name='data_vec')

    # pack kernel
    # input: co, ci, h, w
    # output: gOIhw16i16o
    if False:  # packing disabled: the kernel is expected to arrive pre-packed
        shape = (num_filter // sch.oc_bn, in_channel // sch.ic_bn,
                 kernel_height, kernel_width, sch.ic_bn, sch.oc_bn)
        kernel_pack = tvm.compute(
            shape,
            lambda CO, CI, h, w, ci, co: kernel[CO * sch.oc_bn + co,
                                                CI * sch.ic_bn + ci, h, w],
            name='kernel_pack')
    else:
        kernel_pack = kernel

    # convolution
    oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width,
              sch.oc_bn)
    ovshape = (batch_size, num_filter // sch.oc_bn, out_height, sch.oc_bn,
               out_width)
    unpack_shape = (batch_size, num_filter, out_height, out_width)

    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')

    conv = tvm.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(
            data_vec[n, ic // sch.ic_bn, oh * HSTR + kh,
                     ic % sch.ic_bn, ow * WSTR + kw].astype(out_dtype) *
            kernel_pack[oc_chunk, ic // sch.ic_bn, kh, kw,
                        ic % sch.ic_bn, oc_block],
            axis=[ic, kh, kw]),
        name='conv')

    unpack = tvm.compute(
        unpack_shape,
        lambda n, c, h, w: conv[n, c // sch.oc_bn, h, w, c % sch.oc_bn],
        name='output_unpack',
        tag='conv2d_nchw')
    return unpack
Example #17
def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_pack, conv_out,
                   output, last):
    # print('Run in avx512_conv_common sch')
    # no stride and padding info here
    """
    C, O0, O = conv_out, output, last
    batch, oc, oh, ow = s[O].op.axis
    s[O].parallel(batch)
    return s
    """

    padding = infer_pad(data, data_pad)
    if data_pad is None:
        stride = infer_stride(data, kernel, output)
    else:
        stride = infer_stride(data_pad, kernel, output)
    wkl = _get_workload(data, kernel, stride, padding, output.dtype)
    sch = _get_schedule(wkl)

    HPAD, WPAD = wkl.hpad, wkl.wpad
    DOPAD = (HPAD != 0 and WPAD != 0)

    A, W = data, kernel_pack
    A0, A1 = data_pad, data_vec
    # schedule data
    if DOPAD:
        s[A0].compute_inline()
    batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis
    parallel_axis = s[A1].fuse(ic_chunk, ih)
    s[A1].parallel(parallel_axis)
    s[A1].pragma(batch, "parallel_launch_point")
    s[A1].pragma(parallel_axis, "parallel_stride_pattern")
    s[A1].pragma(batch, "parallel_barrier_when_finish")

    # schedule kernel pack
    if False:  # kernel packing above is disabled, so there is nothing to schedule
        oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis
        s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
        if sch.oc_bn > 1:
            s[W].vectorize(oc_block)
        parallel_axis = s[W].fuse(oc_chunk, oh)
        s[W].parallel(parallel_axis)
        s[W].pragma(parallel_axis, "parallel_launch_point")
        s[W].pragma(parallel_axis, "parallel_stride_pattern")
        s[W].pragma(parallel_axis, "parallel_barrier_when_finish")

    # schedule conv
    C, O0, O = conv_out, output, last
    CC = s.cache_write(C, 'global')

    _, oc_chunk, oh, ow, oc_block = s[C].op.axis
    ow_chunk, ow_block = s[C].split(ow, factor=sch.ur_w)
    s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
    s[C].fuse(oc_chunk, oh)
    s[C].vectorize(oc_block)

    s[CC].compute_at(s[C], ow_chunk)
    _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
    ic, kh, kw = s[CC].op.reduce_axis

    ow_chunk, ow_block = s[CC].split(ow, factor=sch.ur_w)
    ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn)

    if sch.unroll_kw:
        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw,
                      ow_block, oc_block)
        s[CC].unroll(kw)
    else:
        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block,
                      ow_block, oc_block)

    s[CC].fuse(oc_chunk, oh)
    s[CC].vectorize(oc_block)

    s[CC].unroll(ow_block)

    if O0 != O:
        s[O0].compute_inline()

    batch, oc, oh, ow = s[O].op.axis
    ow_chunk, ow_block = s[O].split(ow, factor=sch.ur_w)
    oc_chunk, oc_block = s[O].split(oc, factor=sch.oc_bn)
    s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
    parallel_axis = s[O].fuse(oc_chunk, oh)
    s[C].compute_at(s[O], parallel_axis)
    s[O].vectorize(oc_block)

    s[O].parallel(parallel_axis)
    s[O].pragma(batch, "parallel_launch_point")
    s[O].pragma(parallel_axis, "parallel_stride_pattern")
    s[O].pragma(batch, "parallel_barrier_when_finish")

    return s
Example #18
def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_pack, conv_out,
                   output, last):
    # print('Run in avx512_conv_1x1 sch')
    # no stride and padding info here
    padding = infer_pad(data, data_pad)
    if data_pad is None:
        stride = infer_stride(data, kernel, output)
    else:
        stride = infer_stride(data_pad, kernel, output)

    wkl = _get_workload(data, kernel, stride, padding, output.dtype)
    sch = _get_schedule(wkl)

    A, W = data, kernel_pack
    A0, A1 = data_pad, data_vec
    # schedule data
    if A0 is not None:
        s[A0].compute_inline()
    batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis
    parallel_axis = s[A1].fuse(ic_chunk, ih)
    s[A1].parallel(parallel_axis)
    s[A1].pragma(batch, "parallel_launch_point")
    s[A1].pragma(parallel_axis, "parallel_stride_pattern")
    s[A1].pragma(batch, "parallel_barrier_when_finish")

    # schedule kernel pack
    oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis
    s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
    if sch.oc_bn > 1:
        s[W].vectorize(oc_block)
    parallel_axis = s[W].fuse(oc_chunk, oh)
    s[W].parallel(parallel_axis)
    s[W].pragma(parallel_axis, "parallel_launch_point")
    s[W].pragma(parallel_axis, "parallel_stride_pattern")
    s[W].pragma(parallel_axis, "parallel_barrier_when_finish")

    C, O0, O = conv_out, output, last
    CC = s.cache_write(C, 'global')

    batch, oc_chunk, oh, ow, oc_block = s[C].op.axis
    oh_outer, oh_inner = s[C].split(oh, factor=sch.oh_factor)
    s[C].vectorize(oc_block)

    s[CC].compute_at(s[C], oh_outer)
    _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
    ic, = s[CC].op.reduce_axis

    ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn)

    oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor)
    ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor)

    s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_chunk, ic_block, oh_inner,
                  ow_inner, oc_block)
    s[CC].vectorize(oc_block)

    s[CC].unroll(ow_inner)
    s[CC].unroll(oh_inner)

    if O0 != O:
        s[O0].compute_inline()
    batch, oc, oh, ow = s[O].op.axis

    oc_chunk, oc_block = s[O].split(oc, factor=sch.oc_bn)
    oh_outer, oh_inner = s[O].split(oh, factor=sch.oh_factor)
    ow_outer, ow_inner = s[O].split(ow, factor=sch.ow_factor)
    s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)

    parallel_axis = s[O].fuse(oc_chunk, oh_outer)
    s[C].compute_at(s[O], parallel_axis)
    s[O].vectorize(oc_block)

    s[O].parallel(parallel_axis)
    s[O].pragma(batch, "parallel_launch_point")
    s[O].pragma(parallel_axis, "parallel_stride_pattern")
    s[O].pragma(batch, "parallel_barrier_when_finish")

    return s
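
Both schedules above are built from a small vocabulary: split an axis by a blocking factor, reorder so the block is innermost, vectorize the channel block, and parallelize a fused outer axis. A self-contained toy version, assuming the same 0.x-era TVM API as these examples:

import tvm

n = tvm.var('n')
A = tvm.placeholder((n, 64), name='A')
B = tvm.compute((n, 64), lambda i, j: A[i, j] * 2.0, name='B')

s = tvm.create_schedule(B.op)
i, j = s[B].op.axis
jo, ji = s[B].split(j, factor=16)  # block the inner dim, like oc_bn
s[B].reorder(i, jo, ji)
s[B].vectorize(ji)                 # SIMD over the innermost block
s[B].parallel(i)                   # threads over the outer loop
print(tvm.lower(s, [A, B], simple_mode=True))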
Example #19
def _spatial_get_sch(data, kernel, stride, padding, out_dtype):
    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _schedule_conv2d(wkl)
    return (wkl, sch)
Example #20
    def check_device():
        A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
        W = tvm.placeholder((num_filter, in_channel, kernel_size, kernel_size),
                            name='W')

        out_dtype = 'float32'
        wkl = _get_workload(A, W, stride, padding, out_dtype)
        sch = Im2ColPack(7, 8, 1, 8, True)

        a_shape = get_const_tuple(A.shape)
        w_shape = get_const_tuple(W.shape)

        dtype = A.dtype

        @memoize("topi.tests.test_topi_conv2d.verify_conv2d_nchw")
        def get_ref_data():
            a_np = np.random.uniform(size=a_shape).astype(dtype)
            w_np = np.random.uniform(size=w_shape).astype(dtype)
            b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
            c_np = np.maximum(b_np, 0)
            return a_np, w_np, b_np, c_np

        a_np, w_np, b_np, c_np = get_ref_data()
        # device = 'llvm'
        device = 'llvm -mcpu=skylake-avx512'
        ctx = tvm.context(device, 0)
        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)

        with tvm.build_config(auto_unroll_max_step=1400,
                              unroll_explicit=(device != "cuda")):

            B = _im2col_pack(wkl, sch, A, W, stride, padding, out_dtype)
            s = tvm.create_schedule(B.op)
            traverse(s, B.op)

            op = B.op
            output = op.output(0)
            conv_out = op.input_tensors[0]
            kernel_vec = conv_out.op.input_tensors[1]
            kernel = kernel_vec.op.input_tensors[0]
            data_vec = conv_out.op.input_tensors[0]
            data_col = data_vec.op.input_tensors[0]
            data = data_col.op.input_tensors[0]
            data_pad = None
            if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
                data_pad = data
                data = data_pad.op.input_tensors[0]
            _schedule_im2col_conv2d(wkl, sch, s, data, data_pad, data_col,
                                    data_vec, kernel, kernel_vec, conv_out,
                                    output, B)

            print(tvm.lower(s, [A, W, B], simple_mode=True))

            b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype),
                             ctx)
            func = tvm.build(s, [A, W, B], device)
            time_f = func.time_evaluator(func.entry_name, ctx, number=2000)
            cost = time_f(a, w, b).mean
            print('conv: %g secs/op' % cost)

            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
            print(b_np.shape)
Example #21
def _spatial_get_sch(data, kernel, stride, padding, out_dtype):
    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
    return _get_workload(data, kernel, stride, padding, out_dtype)
Example #22
def get_workload(data, kernel, stride, padding, out_dtype):
    """ Get the workload structure. """
    CO, CI, KH, KW, ci, co = [x.value for x in kernel.shape]
    ori_kernel = tvm.placeholder((CO * co, CI * ci, KH, KW))
    return _get_workload(data, ori_kernel, stride, padding, out_dtype)
Example #23
def _schedule_spatial_conv2d(s, data, data_pad, data_vec, kernel, kernel_vec,
                             conv_out, output, last):
    # no stride and padding info here
    padding = infer_pad(data, data_pad)
    if data_pad is None:
        stride = infer_stride(data, kernel, output)
    else:
        stride = infer_stride(data_pad, kernel, output)
    wkl = _get_workload(data, kernel, stride, padding, output.dtype)
    sch = _get_schedule(wkl)

    H, W = wkl.height, wkl.width
    CI, CO = wkl.in_filter, wkl.out_filter
    HK, WK = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    HCAT, WCAT = HK - 1, WK - 1
    DOPAD = (HPAD != 0 and WPAD != 0)

    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    UNROLL = sch.unroll

    A, B, C = data, kernel, last
    A0, A1 = data_pad, data_vec
    B0 = kernel_vec
    C0, C1 = conv_out, output

    CC = s.cache_write(C0, "global")

    _, co, oh, ow, vh, vw, vc = s[C0].op.axis
    if UNROLL:
        s[C0].unroll(vw)
    s[C0].vectorize(vc)

    s[CC].compute_at(s[C0], ow)
    _, co, oh, ow, vh, vw, vc = s[CC].op.axis
    ci, dh, dw = s[CC].op.reduce_axis
    s[CC].reorder(ci, dh, vh, dw, vw, vc)

    if UNROLL:
        s[CC].unroll(vw)
    s[CC].vectorize(vc)

    ##### Schedule A
    if DOPAD:
        s[A0].compute_inline()

    _, h, _, _, _, _ = s[A1].op.axis
    if sch.ba == 1:
        oaxis = h
        paxis = h
    else:
        oh, ih = s[A1].split(h, sch.ba)
        oaxis = oh
        paxis = ih

    s[A1].parallel(paxis)
    s[A1].pragma(oaxis, "parallel_launch_point")
    s[A1].pragma(paxis, "parallel_stride_pattern")
    s[A1].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule B
    co, _, _, _, _ = s[B0].op.axis
    if sch.bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[B0].split(co, sch.bc)
        oaxis = oco
        paxis = ico

    s[B0].parallel(paxis)
    s[B0].pragma(oaxis, "parallel_launch_point")
    s[B0].pragma(paxis, "parallel_stride_pattern")
    s[B0].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule C
    n, co, h, w = s[C].op.axis
    co, vc = s[C].split(co, VC)
    oh, ow, vh, vw = s[C].tile(h, w, VH, VW)
    s[C].reorder(n, co, oh, ow, vh, vw, vc)
    if C != C1:
        s[C1].compute_inline()
    s[C0].compute_at(s[C], ow)

    if sch.bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[C].split(co, sch.bc)
        oaxis = oco
        paxis = ico

    s[C].parallel(paxis)
    s[C].pragma(oaxis, "parallel_launch_point")
    s[C].pragma(paxis, "parallel_stride_pattern")
    s[C].pragma(oaxis, "parallel_barrier_when_finish")

    return s
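
The cache_write/compute_at pairing used throughout (CC staged into C0, C0 into C) is the central trick: accumulate into a small write buffer, then commit results tile by tile. A self-contained toy version, again assuming the 0.x-era TVM API:

import tvm

A = tvm.placeholder((64, 64), name='A')
B = tvm.compute((64, 64), lambda i, j: A[i, j] + 1.0, name='B')

s = tvm.create_schedule(B.op)
BB = s.cache_write(B, 'global')  # accumulation buffer
io, jo, ii, ji = s[B].tile(B.op.axis[0], B.op.axis[1], 8, 8)
s[BB].compute_at(s[B], jo)       # materialize one 8x8 tile at a time
s[B].vectorize(ji)
print(tvm.lower(s, [A, B], simple_mode=True))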