def traverse(op):
    """Traverse operators from computation graph"""
    # inline all one-to-one-mapping operators except the last stage (output)
    if tag.is_broadcast(op.tag):
        if op not in s.outputs:
            s[op].compute_inline()
        for tensor in op.input_tensors:
            if tensor.op.input_tensors:
                traverse(tensor.op)

    if 'conv2d_nchw' in op.tag:
        output = op.output(0)
        conv_out = op.input_tensors[0]
        kernel_vec = conv_out.op.input_tensors[1]
        kernel = kernel_vec.op.input_tensors[0]
        data_vec = conv_out.op.input_tensors[0]
        data = data_vec.op.input_tensors[0]
        data_pad = None
        if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
            data_pad = data
            data = data_pad.op.input_tensors[0]
        padding = infer_pad(data, data_pad)
        if data_pad is None:
            stride = infer_stride(data, kernel, output)
        else:
            stride = infer_stride(data_pad, kernel, output)
        wkl = _get_workload(data, kernel, stride, padding, output.dtype)
        sch = _get_schedule(wkl)
        return _SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec,
                                           kernel, kernel_vec, conv_out,
                                           output, outs[0])

def get_workload(data, kernel, stride, padding, out_dtype):
    """ Get the workload structure. """
    CO, CI, ci, co, KH, KW = [x.value for x in kernel.shape]
    ori_kernel = tvm.placeholder((CO * co, CI * ci, KH, KW))
    n, _, h, w, _ = [x.value for x in data.shape]
    original_data = tvm.placeholder((n, CI * ci, h, w))
    return _get_workload(original_data, ori_kernel, stride, padding, out_dtype)

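# A minimal NumPy sketch (not part of the original code; shapes and block
# sizes are assumed) of the packed kernel layout that get_workload above
# unpacks: (CO, CI, ci, co, KH, KW), with the block axes in the middle.
# This matches the 1x1 'OI{ic}i{oc}oHW' layout used elsewhere in this file.
import numpy as np

CO, CI, ci, co, KH, KW = 4, 4, 16, 16, 1, 1
packed = np.random.rand(CO, CI, ci, co, KH, KW).astype('float32')
# move each block axis back next to its chunk axis, then merge them
original = packed.transpose(0, 3, 1, 2, 4, 5).reshape(CO * co, CI * ci, KH, KW)
assert original.shape == (64, 64, 1, 1)
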
def alter_conv2d_layout(attrs, inputs, tinfos):
    import ast
    copy_inputs = [s for s in inputs]
    data = tinfos[0]
    kernel = tinfos[1]
    padding = ast.literal_eval(attrs['padding'])
    stride = ast.literal_eval(attrs['strides'])

    wkl = _get_workload(data, kernel, stride, padding, data.dtype)
    sch = _get_schedule_conv(wkl)
    is_kernel_1x1 = isinstance(sch, AVX512Conv1x1Fwd)
    ic_bn, oc_bn = sch.ic_bn, sch.oc_bn

    new_attrs = {k: attrs[k] for k in attrs.keys()}
    new_attrs['layout'] = 'NCHW%dc' % ic_bn
    new_attrs['out_layout'] = 'NCHW%dc' % oc_bn
    if is_kernel_1x1:
        # (oc, ic, h, w) -> (OC, IC, ic, oc, h, w)
        new_attrs['kernel_layout'] = 'OI%di%doHW' % (ic_bn, oc_bn)
    else:
        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
        new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)

    return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)

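# A minimal NumPy sketch (not part of the TVM code above) of what the two
# kernel_layout strings mean. Shapes and block sizes here are illustrative
# assumptions (oc = ic = 64, ic_bn = oc_bn = 16, 3x3 kernel).
import numpy as np

oc, ic, kh, kw, ic_bn, oc_bn = 64, 64, 3, 3, 16, 16
kernel = np.random.rand(oc, ic, kh, kw).astype('float32')

# 'OIHW16i16o': (oc, ic, h, w) -> (OC, IC, h, w, ic_bn, oc_bn)
packed = kernel.reshape(oc // oc_bn, oc_bn, ic // ic_bn, ic_bn, kh, kw) \
               .transpose(0, 2, 4, 5, 3, 1)
assert packed.shape == (4, 4, 3, 3, 16, 16)

# 'OI16i16oHW' (1x1 case): (oc, ic, h, w) -> (OC, IC, ic_bn, oc_bn, h, w)
packed_1x1 = kernel.reshape(oc // oc_bn, oc_bn, ic // ic_bn, ic_bn, kh, kw) \
                   .transpose(0, 2, 3, 1, 4, 5)
assert packed_1x1.shape == (4, 4, 16, 16, 3, 3)
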
def traverse(op):
    """Traverse operators from computation graph"""
    # inline all one-to-one-mapping operators except the last stage (output)
    if tag.is_broadcast(op.tag):
        if op not in s.outputs:
            s[op].compute_inline()
        for tensor in op.input_tensors:
            if tensor.op.input_tensors:
                traverse(tensor.op)

    if 'conv2d_nChwc' in op.tag:
        output = op.output(0)
        conv_out = output
        kernel = conv_out.op.input_tensors[1]
        data_vec = conv_out.op.input_tensors[0]
        if (isinstance(data_vec.op, tvm.tensor.ComputeOp)
                and len(data_vec.op.input_tensors) > 0
                and "pad" not in data_vec.op.tag):
            data = data_vec.op.input_tensors[0]
        else:
            data = data_vec
        data_pad = None
        if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
            data_pad = data
            data = data_pad.op.input_tensors[0]

        n, ic_chunk, h, w, ic_block = [x.value for x in data.shape]
        ic = ic_chunk * ic_block
        original_data = tvm.placeholder((n, ic, h, w), dtype=output.dtype)

        if data_pad is not None:
            n, _, pad_h, pad_w, _ = [x.value for x in data_pad.shape]
            original_data_pad = tvm.placeholder((n, ic, pad_h, pad_w),
                                                dtype=output.dtype)
            padding = infer_pad(original_data, original_data_pad)
        else:
            padding = (0, 0)

        oc, kh, kw = kernel_size
        original_kernel = tvm.placeholder((oc, ic, kh, kw), dtype=output.dtype)

        n, oc_chunk, oh, ow, oc_block = [x.value for x in output.shape]
        original_output = tvm.placeholder((n, oc_chunk * oc_block, oh, ow),
                                          dtype=output.dtype)
        if data_pad is None:
            stride = infer_stride(original_data, original_kernel, original_output)
        else:
            stride = infer_stride(original_data_pad, original_kernel,
                                  original_output)

        wkl = _get_workload(original_data, original_kernel, stride, padding,
                            output.dtype)
        sch = _get_schedule(wkl)
        _SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec, kernel,
                                    conv_out, output, outs[0])

def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
    assert layout == 'NCHW', "only support NCHW convolution on rasp"
    assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)
    return _SCH_TO_DECL_FUNC[type(sch)](data, kernel, stride, padding, layout,
                                        out_dtype)

def _declaration_conv(data, kernel, kernel_size, stride, padding, layout, out_dtype):
    assert layout == 'NCHW', "only support NCHW convolution on avx"
    assert data.shape[0].value == 1, "only support batch size=1 convolution on avx"
    _, ic, _, _ = [x.value for x in data.shape]
    oc, kh, kw = kernel_size
    wkl = _get_workload(data, tvm.placeholder((oc, ic, kh, kw)), stride,
                        padding, out_dtype)
    sch = _get_schedule(wkl)
    return _SCH_TO_DECL_FUNC[type(sch)](data, kernel, stride, padding, layout,
                                        out_dtype)

def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
    assert layout == 'NCHW', "only support NCHW convolution on rasp"
    assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)

    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
    num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)

    pad_height = in_height + 2 * HPAD
    pad_width = in_width + 2 * WPAD
    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1

    # input: c, h, w
    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width, sch.ic_bn)
    data_vec = tvm.compute(
        shape, lambda n, C, h, w, c: data_pad[n, C * sch.ic_bn + c, h, w])

    shape = (num_filter // sch.oc_bn, in_channel // sch.ic_bn, sch.ic_bn,
             sch.oc_bn, 1, 1)
    kernel_pack = tvm.compute(
        shape,
        lambda CO, CI, ci, co, h, w: kernel[CO * sch.oc_bn + co,
                                            CI * sch.ic_bn + ci, h, w])

    oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width, sch.oc_bn)
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    conv = tvm.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(
            data_vec[n, ic // sch.ic_bn, oh * HSTR, ow * WSTR,
                     ic % sch.ic_bn].astype(out_dtype) *
            kernel_pack[oc_chunk, ic // sch.ic_bn, ic % sch.ic_bn,
                        oc_block, 0, 0],
            axis=[ic]),
        name='conv')

    oshape = (batch_size, num_filter, out_height, out_width)
    unpack = tvm.compute(
        oshape,
        lambda n, oc, oh, ow: conv[n, oc // sch.oc_bn, oh, ow, oc % sch.oc_bn],
        tag='conv2d_nchw')
    return unpack

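# A minimal NumPy sketch (illustrative, not part of the code above) of the
# data_vec packing computed by the first tvm.compute in the declaration:
# NCHW data is blocked along the channel axis into NCHW{ic_bn}c. Shapes are
# assumed for the example.
import numpy as np

n, c, h, w, ic_bn = 1, 64, 56, 56, 16
data_pad = np.random.rand(n, c, h, w).astype('float32')
# data_vec[n, C, h, w, c] = data_pad[n, C * ic_bn + c, h, w]
data_vec = data_pad.reshape(n, c // ic_bn, ic_bn, h, w).transpose(0, 1, 3, 4, 2)
assert data_vec.shape == (n, c // ic_bn, h, w, ic_bn)
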
def check_device():
    A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
    W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')

    out_dtype = 'float32'
    wkl = _get_workload(A, W, stride, padding, out_dtype)

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d.verify_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
        c_np = np.maximum(b_np, 0)
        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    device = 'llvm -mcpu=skylake-avx512'
    ctx = tvm.context(device, 0)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)

    with tvm.build_config(auto_unroll_max_step=1400,
                          unroll_explicit=(device != "cuda")):
        A_vec, s = _spatial_pack_data_only(wkl, sch, A)
        a_vec_shape = get_const_tuple(A_vec.shape)
        a_vec = tvm.nd.array(np.zeros(a_vec_shape, dtype=dtype), ctx)
        func = tvm.build(s, [A, A_vec], device)
        time_f = func.time_evaluator(func.entry_name, ctx, number=20)
        cost_data = time_f(a, a_vec).mean

        W_vec, s = _spatial_pack_kernel_only(wkl, sch, W)
        w_vec_shape = get_const_tuple(W_vec.shape)
        w_vec = tvm.nd.array(np.zeros(w_vec_shape, dtype=dtype), ctx)
        func = tvm.build(s, [W, W_vec], device)
        time_f = func.time_evaluator(func.entry_name, ctx, number=20)
        cost_kernel = time_f(w, w_vec).mean

        A_vec = tvm.placeholder(a_vec_shape, name='A_vec')
        W_vec = tvm.placeholder(w_vec_shape, name='W_vec')
        B, s = _spatial_conv_only(wkl, sch, A_vec, W_vec, out_dtype=dtype)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        func = tvm.build(s, [A_vec, W_vec, B], target=device)
        time_f = func.time_evaluator(func.entry_name, ctx, number=20)
        cost_conv = time_f(a_vec, w_vec, b).mean

    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
    return (cost_data, cost_kernel, cost_conv)

def _declaration_conv(data, kernel, num_filter, kernel_size, stride, padding,
                      out_dtype):
    assert data.shape[0].value == 1, "only support batch size=1 convolution on avx"
    n, ic_chunk, h, w, ic_block = [x.value for x in data.shape]
    ic = ic_chunk * ic_block
    oc = num_filter
    kh, kw = kernel_size
    wkl = _get_workload(tvm.placeholder((n, ic, h, w), dtype=out_dtype),
                        tvm.placeholder((oc, ic, kh, kw), dtype=out_dtype),
                        stride, padding, out_dtype)
    sch = _get_schedule(wkl)
    return _SCH_TO_DECL_FUNC[type(sch)](wkl, data, kernel)

def traverse(op):
    """Traverse operators from computation graph"""
    # inline all one-to-one-mapping operators except the last stage (output)
    if tag.is_broadcast(op.tag):
        if op not in s.outputs:
            s[op].compute_inline()
        for tensor in op.input_tensors:
            if tensor.op.input_tensors:
                traverse(tensor.op)

    if 'conv2d_nChwc' in op.tag:
        output = op.output(0)
        conv_out = op.input_tensors[0] if 'conv2d_nChwc_unpack' in op.tag else output
        kernel = conv_out.op.input_tensors[1]
        data_vec = conv_out.op.input_tensors[0]
        if (isinstance(data_vec.op, tvm.tensor.ComputeOp)
                and len(data_vec.op.input_tensors) > 0
                and "pad" not in data_vec.op.tag):
            data = data_vec.op.input_tensors[0]
        else:
            data = data_vec
        data_pad = None
        if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
            data_pad = data
            data = data_pad.op.input_tensors[0]

        ndim_input = len(data.shape)
        if ndim_input == 5:
            n, ic_chunk, h, w, ic_block = [x.value for x in data.shape]
            ic = ic_chunk * ic_block
        else:
            n, ic, h, w = [x.value for x in data.shape]
        original_data = tvm.placeholder((n, ic, h, w), dtype=output.dtype)

        oc = num_filter
        kh, kw = kernel_size
        original_kernel = tvm.placeholder((oc, ic, kh, kw), dtype=output.dtype)

        wkl = _get_workload(original_data, original_kernel, stride, padding,
                            output.dtype)
        sch = _get_schedule(wkl)
        _SCH_TO_SCH_FUNC[type(sch)](s, wkl, data, data_pad, data_vec, kernel,
                                    conv_out, output, outs[0])

def traverse(op):
    """Traverse operators from computation graph"""
    # inline all one-to-one-mapping operators except the last stage (output)
    if tag.is_broadcast(op.tag):
        if op not in s.outputs:
            s[op].compute_inline()
        for tensor in op.input_tensors:
            if tensor.op.input_tensors:
                traverse(tensor.op)

    if 'conv2d_nchw' in op.tag:
        output = op.output(0)
        conv_out = op.input_tensors[0]
        kernel = conv_out.op.input_tensors[1]
        data_vec = conv_out.op.input_tensors[0]
        data = data_vec.op.input_tensors[0]
        data_pad = None
        if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
            data_pad = data
            data = data_pad.op.input_tensors[0]
        padding = infer_pad(data, data_pad)

        _, ic, _, _ = [x.value for x in data.shape]
        oc = num_filter
        kh, kw = kernel_size
        original_kernel = tvm.placeholder((oc, ic, kh, kw))

        if data_pad is None:
            stride = infer_stride(data, original_kernel, output)
        else:
            stride = infer_stride(data_pad, original_kernel, output)
        wkl = _get_workload(data, original_kernel, stride, padding, output.dtype)
        sch = _get_schedule(wkl)
        _SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec, kernel,
                                    conv_out, output, outs[0])

def weight_prepack_conv2d(attrs, inputs, tinfos):
    import ast
    data = tinfos[0]
    kernel = tinfos[1]
    padding = ast.literal_eval(attrs['padding'])
    stride = ast.literal_eval(attrs['strides'])

    wkl = _get_workload(data, kernel, stride, padding, 'float32')
    sch = _get_schedule_conv(wkl)
    is_kernel_1x1 = isinstance(sch, AVX512Conv1x1Fwd)
    ic_bn, oc_bn = sch.ic_bn, sch.oc_bn

    new_attrs = {k: attrs[k] for k in attrs.keys()}
    new_attrs.pop('layout', None)

    kernel_sym = inputs[1]
    oc, ic, h, w = get_const_tuple(tinfos[1].shape)
    OC = oc // oc_bn
    IC = ic // ic_bn
    trans_kernel = sym.transpose(kernel_sym, axes=(1, 2, 3, 0))
    trans_kernel = sym.reshape(trans_kernel, shape=(ic, h, w, OC, oc_bn))
    trans_kernel = sym.transpose(trans_kernel, axes=(1, 2, 3, 4, 0))
    trans_kernel = sym.reshape(trans_kernel, shape=(h, w, OC, oc_bn, IC, ic_bn))
    if is_kernel_1x1:
        # (oc, ic, h, w) -> (OC, IC, ic, oc, h, w)
        trans_kernel = sym.transpose(trans_kernel, axes=(2, 4, 5, 3, 0, 1))
    else:
        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
        trans_kernel = sym.transpose(trans_kernel, axes=(2, 4, 0, 1, 5, 3))

    if attrs.get_bool('use_bias'):
        bias = inputs[2]
        bias = sym.reshape(bias, shape=(OC, oc_bn))
        return sym.contrib.conv2d_nchw_kernel_packed(inputs[0], trans_kernel,
                                                     bias, **new_attrs)
    else:
        return sym.contrib.conv2d_nchw_kernel_packed(inputs[0], trans_kernel,
                                                     **new_attrs)

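# A NumPy sanity check (illustrative, mirroring the sym.transpose/sym.reshape
# chain above under assumed shapes) that the multi-step transform equals a
# direct one-shot packing into (OC, IC, h, w, ic_bn, oc_bn).
import numpy as np

oc, ic, h, w, ic_bn, oc_bn = 64, 64, 3, 3, 16, 16
OC, IC = oc // oc_bn, ic // ic_bn
k = np.random.rand(oc, ic, h, w).astype('float32')

t = k.transpose(1, 2, 3, 0)                # (ic, h, w, oc)
t = t.reshape(ic, h, w, OC, oc_bn)         # split oc
t = t.transpose(1, 2, 3, 4, 0)             # (h, w, OC, oc_bn, ic)
t = t.reshape(h, w, OC, oc_bn, IC, ic_bn)  # split ic
t = t.transpose(2, 4, 0, 1, 5, 3)          # (OC, IC, h, w, ic_bn, oc_bn)

direct = k.reshape(OC, oc_bn, IC, ic_bn, h, w).transpose(0, 2, 4, 5, 3, 1)
np.testing.assert_allclose(t, direct)
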
def weight_prepack_conv2d(attrs, inputs, tinfos):
    import ast
    data_sym = inputs[0]
    data = tinfos[0]
    kernel = tinfos[1]
    padding = ast.literal_eval(attrs['padding'])
    stride = ast.literal_eval(attrs['strides'])

    wkl = _get_workload(data, kernel, stride, padding, 'float32')
    sch = _get_schedule_conv(wkl)
    is_kernel_1x1 = isinstance(sch, AVX512Conv1x1Fwd)
    ic_bn, oc_bn = sch.ic_bn, sch.oc_bn

    # TODO: hack checking input layer
    if ic_bn == 3:
        data_sym = sym.expand_dims(data_sym, axis=4)

    new_attrs = {k: attrs[k] for k in attrs.keys()}
    new_attrs['layout'] = 'NCHWc'
    new_attrs['ic_bn'] = ic_bn
    new_attrs['oc_bn'] = oc_bn

    kernel_sym = inputs[1]
    reorder_attrs = {'ic_bn': ic_bn, 'oc_bn': oc_bn, 'kernel_1x1': is_kernel_1x1}
    trans_kernel = sym.reorder(kernel_sym, **reorder_attrs)

    if attrs.get_bool('use_bias'):
        bias = inputs[2]
        bias = sym.bn_reorder(bias, bn=oc_bn)
        return sym.conv2d_nopack(data_sym, trans_kernel, bias, **new_attrs)
    else:
        return sym.conv2d_nopack(data_sym, trans_kernel, **new_attrs)

def weight_prepack_conv2d(attrs, inputs, tinfos):
    import ast
    data = tinfos[0]
    kernel = tinfos[1]
    padding = ast.literal_eval(attrs['padding'])
    stride = ast.literal_eval(attrs['strides'])

    wkl = _get_workload(data, kernel, stride, padding, 'float32')
    sch = _get_schedule_conv(wkl)
    is_kernel_1x1 = isinstance(sch, AVX512Conv1x1Fwd)
    ic_bn, oc_bn = sch.ic_bn, sch.oc_bn

    kernel_sym = inputs[1]
    reorder_attrs = {'ic_bn': ic_bn, 'oc_bn': oc_bn, 'kernel_1x1': is_kernel_1x1}
    trans_kernel = sym.reorder(kernel_sym, **reorder_attrs)

    if attrs.get_bool('use_bias'):
        return sym.conv2d_prepack(inputs[0], trans_kernel, inputs[2], **attrs)
    else:
        return sym.conv2d_prepack(inputs[0], trans_kernel, **attrs)

def _schedule_im2col_conv2d(s, data, data_pad, data_col, data_vec, kernel,
                            kernel_vec, conv_out, output, last):
    # no stride and padding info here
    padding = infer_pad(data, data_pad)
    if data_pad is None:
        stride = infer_stride(data, kernel, output)
    else:
        stride = infer_stride(data_pad, kernel, output)
    wkl = _get_workload(data, kernel, stride, padding, output.dtype)
    sch = _schedule_conv2d(wkl)

    H, W = wkl.height, wkl.width
    CI = wkl.in_filter
    CO = wkl.out_filter
    HK, WK = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride
    HCAT, WCAT = HK - 1, WK - 1
    DOPAD = (HPAD != 0 and WPAD != 0)

    P = sch.vp
    Q = sch.vq
    UNROLL = sch.unroll

    A, B, C = data, kernel, last
    A0, A1, A2 = data_pad, data_col, data_vec
    B0 = kernel_vec
    C0, C1 = conv_out, output

    CC = s.cache_write(C0, "global")
    AA = s.cache_read(A2, "global", [CC])
    BB = s.cache_read(B0, "global", [CC])

    ##### Schedule CC
    _, co, im, vim, vco = s[C0].op.axis
    s[C0].unroll(vim)
    s[C0].vectorize(vco)

    s[CC].compute_at(s[C0], im)
    _, co, im, vim, vco = s[CC].op.axis
    ci, hk, wk = s[CC].op.reduce_axis
    s[CC].reorder(ci, hk, wk, vim, vco)
    s[CC].unroll(vim)
    s[CC].vectorize(vco)

    ##### Schedule C
    _, co, h, w = s[C].op.axis
    im = s[C].fuse(h, w)
    im, vim = s[C].split(im, P)
    co, vco = s[C].split(co, Q)
    s[C].reorder(co, im, vim, vco)

    if sch.bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[C].split(co, sch.bc)
        oaxis = oco
        paxis = ico
    s[C].parallel(paxis)
    s[C].pragma(oaxis, "parallel_launch_point")
    s[C].pragma(paxis, "parallel_stride_pattern")
    s[C].pragma(oaxis, "parallel_barrier_when_finish")

    if C1 != C:
        s[C1].compute_inline()
    s[C0].compute_at(s[C], paxis)

    ##### Schedule A
    if DOPAD:
        s[A0].compute_inline()
    s[A1].compute_inline()
    s[AA].compute_at(s[CC], wk)
    s[AA].unroll(AA.op.axis[4])

    _, im, _, _, _, _ = s[A2].op.axis
    if sch.ba == 1:
        oaxis = im
        paxis = im
    else:
        oim, iim = s[A2].split(im, sch.ba)
        oaxis = oim
        paxis = iim
    s[A2].parallel(paxis)
    s[A2].pragma(oaxis, "parallel_launch_point")
    s[A2].pragma(paxis, "parallel_stride_pattern")
    s[A2].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule B
    s[BB].compute_at(s[CC], wk)
    s[BB].vectorize(BB.op.axis[4])
    co, _, _, _, _ = s[B0].op.axis
    if sch.bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[B0].split(co, sch.bc)
        oaxis = oco
        paxis = ico
    s[B0].parallel(paxis)
    s[B0].pragma(oaxis, "parallel_launch_point")
    s[B0].pragma(paxis, "parallel_stride_pattern")
    s[B0].pragma(oaxis, "parallel_barrier_when_finish")

    return s

def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
    assert layout == 'NCHW', "only support NCHW convolution on avx"
    assert data.shape[0].value == 1, "only support batch size=1 convolution on avx"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)

    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
    if len(kernel.shape) == 4:
        num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)
    else:
        # kernel arrives pre-packed as (OC, IC, h, w, ic, oc)
        num_filter, _, kernel_height, kernel_width, ic, oc = get_const_tuple(
            kernel.shape)
        num_filter *= oc

    pad_height = in_height + 2 * HPAD
    pad_width = in_width + 2 * WPAD
    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1

    # pack data
    # input: c, h, w
    shape = (batch_size, in_channel, pad_height, pad_width)
    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    shape = (batch_size, in_channel // sch.ic_bn, pad_height, sch.ic_bn, pad_width)
    data_vec = tvm.compute(
        shape,
        lambda n, C, h, c, w: data_pad[n, C * sch.ic_bn + c, h, w],
        name='data_vec')

    # pack kernel
    # input: co, ci, h, w
    # output: gOIhw16i16o
    if False:  # disabled: the kernel already arrives pre-packed
        shape = (num_filter // sch.oc_bn, in_channel // sch.ic_bn,
                 kernel_height, kernel_width, sch.ic_bn, sch.oc_bn)
        kernel_pack = tvm.compute(
            shape,
            lambda CO, CI, h, w, ci, co: kernel[CO * sch.oc_bn + co,
                                                CI * sch.ic_bn + ci, h, w],
            name='kernel_pack')
    else:
        kernel_pack = kernel

    # convolution
    oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width, sch.oc_bn)
    ovshape = (batch_size, num_filter // sch.oc_bn, out_height, sch.oc_bn, out_width)
    unpack_shape = (batch_size, num_filter, out_height, out_width)

    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')

    conv = tvm.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(
            data_vec[n, ic // sch.ic_bn, oh * HSTR + kh, ic % sch.ic_bn,
                     ow * WSTR + kw].astype(out_dtype) *
            kernel_pack[oc_chunk, ic // sch.ic_bn, kh, kw, ic % sch.ic_bn,
                        oc_block],
            axis=[ic, kh, kw]),
        name='conv')

    unpack = tvm.compute(
        unpack_shape,
        lambda n, c, h, w: conv[n, c // sch.oc_bn, h, w, c % sch.oc_bn],
        name='output_unpack',
        tag='conv2d_nchw')
    return unpack

def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_pack, conv_out,
                   output, last):
    # no stride and padding info here
    padding = infer_pad(data, data_pad)
    if data_pad is None:
        stride = infer_stride(data, kernel, output)
    else:
        stride = infer_stride(data_pad, kernel, output)
    wkl = _get_workload(data, kernel, stride, padding, output.dtype)
    sch = _get_schedule(wkl)

    HPAD, WPAD = wkl.hpad, wkl.wpad
    DOPAD = (HPAD != 0 and WPAD != 0)

    A, W = data, kernel_pack
    A0, A1 = data_pad, data_vec

    # schedule data
    if DOPAD:
        s[A0].compute_inline()
    batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis
    parallel_axis = s[A1].fuse(ic_chunk, ih)
    s[A1].parallel(parallel_axis)
    s[A1].pragma(batch, "parallel_launch_point")
    s[A1].pragma(parallel_axis, "parallel_stride_pattern")
    s[A1].pragma(batch, "parallel_barrier_when_finish")

    # schedule kernel pack
    if False:  # disabled: the kernel already arrives pre-packed
        oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis
        s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
        if sch.oc_bn > 1:
            s[W].vectorize(oc_block)
        parallel_axis = s[W].fuse(oc_chunk, oh)
        s[W].parallel(parallel_axis)
        s[W].pragma(parallel_axis, "parallel_launch_point")
        s[W].pragma(parallel_axis, "parallel_stride_pattern")
        s[W].pragma(parallel_axis, "parallel_barrier_when_finish")

    # schedule conv
    C, O0, O = conv_out, output, last
    CC = s.cache_write(C, 'global')

    _, oc_chunk, oh, ow, oc_block = s[C].op.axis
    ow_chunk, ow_block = s[C].split(ow, factor=sch.ur_w)
    s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
    s[C].fuse(oc_chunk, oh)
    s[C].vectorize(oc_block)

    s[CC].compute_at(s[C], ow_chunk)
    _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
    ic, kh, kw = s[CC].op.reduce_axis

    ow_chunk, ow_block = s[CC].split(ow, factor=sch.ur_w)
    ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn)

    if sch.unroll_kw:
        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw,
                      ow_block, oc_block)
        s[CC].unroll(kw)
    else:
        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block,
                      ow_block, oc_block)

    s[CC].fuse(oc_chunk, oh)
    s[CC].vectorize(oc_block)
    s[CC].unroll(ow_block)

    if O0 != O:
        s[O0].compute_inline()
    batch, oc, oh, ow = s[O].op.axis

    ow_chunk, ow_block = s[O].split(ow, factor=sch.ur_w)
    oc_chunk, oc_block = s[O].split(oc, factor=sch.oc_bn)
    s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
    parallel_axis = s[O].fuse(oc_chunk, oh)
    s[C].compute_at(s[O], parallel_axis)
    s[O].vectorize(oc_block)
    s[O].parallel(parallel_axis)
    s[O].pragma(batch, "parallel_launch_point")
    s[O].pragma(parallel_axis, "parallel_stride_pattern")
    s[O].pragma(batch, "parallel_barrier_when_finish")

    return s

def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_pack, conv_out,
                   output, last):
    # no stride and padding info here
    padding = infer_pad(data, data_pad)
    if data_pad is None:
        stride = infer_stride(data, kernel, output)
    else:
        stride = infer_stride(data_pad, kernel, output)
    wkl = _get_workload(data, kernel, stride, padding, output.dtype)
    sch = _get_schedule(wkl)

    A, W = data, kernel_pack
    A0, A1 = data_pad, data_vec

    # schedule data
    if A0 is not None:
        s[A0].compute_inline()
    batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis
    parallel_axis = s[A1].fuse(ic_chunk, ih)
    s[A1].parallel(parallel_axis)
    s[A1].pragma(batch, "parallel_launch_point")
    s[A1].pragma(parallel_axis, "parallel_stride_pattern")
    s[A1].pragma(batch, "parallel_barrier_when_finish")

    # schedule kernel pack
    oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis
    s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
    if sch.oc_bn > 1:
        s[W].vectorize(oc_block)
    parallel_axis = s[W].fuse(oc_chunk, oh)
    s[W].parallel(parallel_axis)
    s[W].pragma(parallel_axis, "parallel_launch_point")
    s[W].pragma(parallel_axis, "parallel_stride_pattern")
    s[W].pragma(parallel_axis, "parallel_barrier_when_finish")

    C, O0, O = conv_out, output, last
    CC = s.cache_write(C, 'global')

    batch, oc_chunk, oh, ow, oc_block = s[C].op.axis
    oh_outer, oh_inner = s[C].split(oh, factor=sch.oh_factor)
    s[C].vectorize(oc_block)

    s[CC].compute_at(s[C], oh_outer)
    _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
    ic, = s[CC].op.reduce_axis

    ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn)

    oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor)
    ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor)

    s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_chunk, ic_block, oh_inner,
                  ow_inner, oc_block)
    s[CC].vectorize(oc_block)

    s[CC].unroll(ow_inner)
    s[CC].unroll(oh_inner)

    if O0 != O:
        s[O0].compute_inline()
    batch, oc, oh, ow = s[O].op.axis

    oc_chunk, oc_block = s[O].split(oc, factor=sch.oc_bn)
    oh_outer, oh_inner = s[O].split(oh, factor=sch.oh_factor)
    ow_outer, ow_inner = s[O].split(ow, factor=sch.ow_factor)
    s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)

    parallel_axis = s[O].fuse(oc_chunk, oh_outer)
    s[C].compute_at(s[O], parallel_axis)
    s[O].vectorize(oc_block)
    s[O].parallel(parallel_axis)
    s[O].pragma(batch, "parallel_launch_point")
    s[O].pragma(parallel_axis, "parallel_stride_pattern")
    s[O].pragma(batch, "parallel_barrier_when_finish")

    return s

def _spatial_get_sch(data, kernel, stride, padding, out_dtype):
    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _schedule_conv2d(wkl)
    return (wkl, sch)

def check_device():
    A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
    W = tvm.placeholder((num_filter, in_channel, kernel_size, kernel_size), name='W')

    out_dtype = 'float32'
    wkl = _get_workload(A, W, stride, padding, out_dtype)
    sch = Im2ColPack(7, 8, 1, 8, True)

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d.verify_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
        c_np = np.maximum(b_np, 0)
        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    device = 'llvm -mcpu=skylake-avx512'
    ctx = tvm.context(device, 0)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)

    with tvm.build_config(auto_unroll_max_step=1400,
                          unroll_explicit=(device != "cuda")):
        B = _im2col_pack(wkl, sch, A, W, stride, padding, out_dtype)
        s = tvm.create_schedule(B.op)
        traverse(s, B.op)

        op = B.op
        output = op.output(0)
        conv_out = op.input_tensors[0]
        kernel_vec = conv_out.op.input_tensors[1]
        kernel = kernel_vec.op.input_tensors[0]
        data_vec = conv_out.op.input_tensors[0]
        data_col = data_vec.op.input_tensors[0]
        data = data_col.op.input_tensors[0]
        data_pad = None
        if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
            data_pad = data
            data = data_pad.op.input_tensors[0]
        _schedule_im2col_conv2d(wkl, sch, s, data, data_pad, data_col, data_vec,
                                kernel, kernel_vec, conv_out, output, B)

        print(tvm.lower(s, [A, W, B], simple_mode=True))

        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        func = tvm.build(s, [A, W, B], device)
        time_f = func.time_evaluator(func.entry_name, ctx, number=2000)
        cost = time_f(a, w, b).mean
        print('conv: %g secs/op' % cost)

    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
    print(b_np.shape)

def _spatial_get_sch(data, kernel, stride, padding, out_dtype):
    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
    return _get_workload(data, kernel, stride, padding, out_dtype)

def get_workload(data, kernel, stride, padding, out_dtype):
    """ Get the workload structure. """
    CO, CI, KH, KW, ci, co = [x.value for x in kernel.shape]
    ori_kernel = tvm.placeholder((CO * co, CI * ci, KH, KW))
    return _get_workload(data, ori_kernel, stride, padding, out_dtype)

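# A NumPy sketch (illustrative, with assumed block sizes) of how the original
# (oc, ic, kh, kw) kernel relates to the (CO, CI, KH, KW, ci, co) layout that
# this get_workload variant unpacks. Note the block axes sit at the end here,
# unlike the (CO, CI, ci, co, KH, KW) variant earlier in this section.
import numpy as np

CO, CI, KH, KW, ci, co = 4, 4, 3, 3, 16, 16
packed = np.random.rand(CO, CI, KH, KW, ci, co).astype('float32')
# move the block axes back next to their chunk axes, then merge them
original = packed.transpose(0, 5, 1, 4, 2, 3).reshape(CO * co, CI * ci, KH, KW)
assert original.shape == (64, 64, 3, 3)
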
def _schedule_spatial_conv2d(s, data, data_pad, data_vec, kernel, kernel_vec,
                             conv_out, output, last):
    # no stride and padding info here
    padding = infer_pad(data, data_pad)
    if data_pad is None:
        stride = infer_stride(data, kernel, output)
    else:
        stride = infer_stride(data_pad, kernel, output)
    wkl = _get_workload(data, kernel, stride, padding, output.dtype)
    sch = _get_schedule(wkl)

    H, W = wkl.height, wkl.width
    CI, CO = wkl.in_filter, wkl.out_filter
    HK, WK = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride
    HCAT, WCAT = HK - 1, WK - 1
    DOPAD = (HPAD != 0 and WPAD != 0)

    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    UNROLL = sch.unroll

    A, B, C = data, kernel, last
    A0, A1 = data_pad, data_vec
    B0 = kernel_vec
    C0, C1 = conv_out, output

    CC = s.cache_write(C0, "global")
    _, co, oh, ow, vh, vw, vc = s[C0].op.axis
    if UNROLL:
        s[C0].unroll(vw)
    s[C0].vectorize(vc)

    s[CC].compute_at(s[C0], ow)
    _, co, oh, ow, vh, vw, vc = s[CC].op.axis
    ci, dh, dw = s[CC].op.reduce_axis
    s[CC].reorder(ci, dh, vh, dw, vw, vc)
    if UNROLL:
        s[CC].unroll(vw)
    s[CC].vectorize(vc)

    ##### Schedule A
    if DOPAD:
        s[A0].compute_inline()
    _, h, _, _, _, _ = s[A1].op.axis
    if sch.ba == 1:
        oaxis = h
        paxis = h
    else:
        oh, ih = s[A1].split(h, sch.ba)
        oaxis = oh
        paxis = ih
    s[A1].parallel(paxis)
    s[A1].pragma(oaxis, "parallel_launch_point")
    s[A1].pragma(paxis, "parallel_stride_pattern")
    s[A1].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule B
    co, _, _, _, _ = s[B0].op.axis
    if sch.bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[B0].split(co, sch.bc)
        oaxis = oco
        paxis = ico
    s[B0].parallel(paxis)
    s[B0].pragma(oaxis, "parallel_launch_point")
    s[B0].pragma(paxis, "parallel_stride_pattern")
    s[B0].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule C
    n, co, h, w = s[C].op.axis
    co, vc = s[C].split(co, VC)
    oh, ow, vh, vw = s[C].tile(h, w, VH, VW)
    s[C].reorder(n, co, oh, ow, vh, vw, vc)
    if C != C1:
        s[C1].compute_inline()
    s[C0].compute_at(s[C], ow)

    if sch.bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[C].split(co, sch.bc)
        oaxis = oco
        paxis = ico
    s[C].parallel(paxis)
    s[C].pragma(oaxis, "parallel_launch_point")
    s[C].pragma(paxis, "parallel_stride_pattern")
    s[C].pragma(oaxis, "parallel_barrier_when_finish")

    return s