def _conv_schedule_asm(outs):
    """Schedule for conv2d ops tagged 'asm_conv2d_output'."""
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if "asm_conv2d_output" in op.tag:
            # schedule conv2d
            output = op.output(0)
            conv = op.input_tensors[0]

            # The first input may be an "attr" tensor; skip it if present.
            sidx = 0
            if conv.op.input_tensors[0].name == "attr":
                sidx = 1
            data_vec = conv.op.input_tensors[sidx]
            data_pad = data_vec.op.input_tensors[0]
            s[data_pad].compute_inline()

            kernel_vec = conv.op.input_tensors[sidx + 1]
            if kernel_vec.op.name == "kernel_vec":
                kernel = kernel_vec.op.input_tensors[0]
            else:
                kernel = kernel_vec
            if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
                s[kernel].compute_inline()

            # Both layouts (with or without a leading "attr" input) use the same schedule.
            _schedule_asm(s, data_vec, kernel_vec, conv, output, outs[0])

    traverse_inline(s, outs[0].op, _callback)
    return s
def schedule_sparse_dense_cuda_allreduce_autotune(cfg, outs):
    """Create schedule for sparse dense (CUDA, all-reduce, autotunable)."""
    s = te.create_schedule([x.op for x in outs])

    def _callback(op):
        if op.tag == "sparse_dense_bsrmm":
            y_bsrmm = op.input_tensors[0]
            w_indptr = y_bsrmm.op.input_tensors[0]
            assert y_bsrmm.op.tag == "sparse_dense_bsrmm_block"
            y_reshape = op
            (m, num_blocks, b_r) = s[y_bsrmm].op.axis
            bs_r = get_const_int(b_r.dom.extent)
            (elem_idx, c) = s[y_bsrmm].op.reduce_axis

            # Bind the reshaped output over a 2D grid of thread blocks and
            # compute the block-sparse matmul inside each block.
            (m_o, n_o) = s[y_reshape].op.axis
            s[y_reshape].bind(m_o, te.thread_axis("blockIdx.x"))
            s[y_reshape].bind(n_o, te.thread_axis("blockIdx.y"))
            s[y_bsrmm].compute_at(s[y_reshape], n_o)

            # Split the reduction axis (tunable factor) and rfactor so the
            # inner part is reduced cooperatively across threadIdx.x.
            thread_x = te.thread_axis("threadIdx.x")
            cfg.define_split("tile_c", c, num_outputs=2)
            co, ci = cfg['tile_c'].apply(s, y_bsrmm, c)
            y_bsrmm_factored = s.rfactor(y_bsrmm, ci)
            tx = s[y_bsrmm].op.reduce_axis[0]
            s[y_bsrmm].bind(tx, thread_x)
            s[y_bsrmm_factored].compute_at(s[y_bsrmm], tx)
            s[y_bsrmm].set_store_predicate(thread_x.var.equal(0))
            s[y_reshape].set_store_predicate(thread_x.var.equal(0))

    traverse_inline(s, outs[0].op, _callback)
    return s
def _conv_schedule_deconv(cfg, outs):
    """schedule_conv2d_nchw_arm_cpu_deconv inner implementation"""
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if "deconv_conv2d_output" in op.tag:
            # schedule conv2d
            output = op.output(0)
            conv = op.input_tensors[0]

            sidx = 0
            if conv.op.input_tensors[0].name == "attr":
                sidx = 1
            data_vec = conv.op.input_tensors[sidx]
            kernel_vec = conv.op.input_tensors[sidx + 1]
            if kernel_vec.op.name == "kernel_vec":
                kernel = kernel_vec.op.input_tensors[0]
            else:
                kernel = kernel_vec
            if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
                s[kernel].compute_inline()

            _schedule_deconv(cfg, s, data_vec, kernel_vec, conv, output, outs[0])

    traverse_inline(s, outs[0].op, _callback)
    return s
def conv2d_NCHW16c_OHWI16o_schedule(attrs, outs, target):
    from topi.util import traverse_inline

    sch = te.create_schedule([i.op for i in outs])
    output = outs[0].op

    def callback(op):
        nonlocal sch
        if len(list(op.reduce_axis)):
            a, b = op.input_tensors
            tune.ashape = get_const_tuple(a.shape)
            tune.bshape = get_const_tuple(b.shape)
            conv = op.output(0)
            n, c, h, w, _ = get_const_tuple(conv.shape)
            stride_h, stride_w = attrs.get_int_tuple('strides')
            tune.strides = (stride_h, stride_w)
            ky = tune.ashape, tune.bshape, (stride_h, stride_w)
            if tune.enable and ky in tune.cuda_kernel.keys():
                tune.splitk = int(tune.cuda_kernel[ky])
            # Tile along the width dimension when it is a multiple of 32;
            # otherwise fuse the spatial dimensions.
            if w % 32 == 0:
                _conv2d_schedule_wdim(sch, conv, output, stride_h, stride_w)
            else:
                assert h * w % 32 == 0 and 32 % w == 0
                _conv2d_schedule_fused(sch, conv, output, stride_h, stride_w)

    traverse_inline(sch, output, callback)
    tune.splitk = None
    return sch
def _schedule_custom_dense_pack(cfg, outs):
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if "dense_pack" in op.tag:
            _schedule_custom_dense_pack_template(cfg, s, op.output(0))

    traverse_inline(s, outs[0].op, _callback)
    return s
def conv2d_direct_nhwc_schedule(cfg, outs):
    """Schedule function for directly-scheduled conv2d on NHWC layout."""
    sched = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if 'conv2d_nhwc' not in op.tag:
            return

        ### extract tensors ###
        output = op.output(0)
        conv = op
        data_vec = conv.input_tensors[0]
        kernel = conv.input_tensors[1]  # pylint: disable=unused-variable
        last = outs[0]  # pylint: disable=unused-variable

        # tile reduction axes
        n, oh, ow, co = sched[conv].op.axis
        kh, kw, ci = sched[conv].op.reduce_axis

        # NOTE we can't inline data padding in the SIMD path, because it
        # introduces conditionals in the inner loop.
        data_pad = data_vec.op
        sched[data_pad].compute_inline()

        co, vc = cfg['tile_co'].apply(sched, conv, co)
        oh, vh = cfg['tile_oh'].apply(sched, conv, oh)
        ow, vw = cfg['tile_ow'].apply(sched, conv, ow)
        cfg['reorder_0'].apply(sched, conv, [n, co, oh, ow, ci, kh, kw, vh, vw, vc])
        cfg['ann_reduce'].apply(sched, conv, [kh, kw],
                                axis_lens=[get_const_int(kh.dom.extent),
                                           get_const_int(kw.dom.extent)],
                                max_unroll=8,
                                cfg=cfg)
        cfg['ann_spatial'].apply(sched, conv, [vh, vw, vc],
                                 axis_lens=[cfg['tile_oh'].size[-1],
                                            cfg['tile_ow'].size[-1],
                                            cfg['tile_co'].size[-1]],
                                 max_unroll=8,
                                 cfg=cfg)

        kernel_scope = n  # this is the scope to attach global config inside this kernel

        # tune unroll
        sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
        sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)

    traverse_inline(sched, outs[-1].op, _callback)
    return sched
def schedule_sparse_dense_cuda_baseline(outs):
    """Create schedule for sparse dense (CUDA baseline)."""
    s = te.create_schedule([x.op for x in outs])

    def _callback(op):
        if op.tag == "sparse_dense_bsrmm":
            y_bsrmm = op.input_tensors[0]
            assert y_bsrmm.op.tag == "sparse_dense_bsrmm_block"
            y_reshape = op
            (m, num_blocks, b_r) = s[y_bsrmm].op.axis
            bs_r = get_const_int(b_r.dom.extent)
            (elem_idx, c) = s[y_bsrmm].op.reduce_axis
            # Bind the two output axes to the CUDA grid and thread dimensions,
            # computing each block-sparse row reduction inside its thread.
            s[y_reshape].bind(s[y_reshape].op.axis[0], te.thread_axis("blockIdx.x"))
            s[y_reshape].bind(s[y_reshape].op.axis[1], te.thread_axis("threadIdx.x"))
            s[y_bsrmm].compute_at(s[y_reshape], s[y_reshape].op.axis[1])

    traverse_inline(s, outs[0].op, _callback)
    return s
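# --- usage sketch (illustrative, not part of the original module) -----------
# A minimal sketch of how a schedule function like the one above is typically
# driven: build a block-sparse dense workload with topi.nn.sparse_dense (whose
# computes are tagged "sparse_dense_bsrmm" / "sparse_dense_bsrmm_block"), apply
# the schedule, and lower it. The wrapper name, shapes, and dtypes here are
# assumptions made purely for illustration.
def _example_sparse_dense_baseline():
    import tvm
    from tvm import te
    import topi

    m, n, k, bs_r, bs_c, num_blocks = 128, 512, 256, 16, 1, 64
    data = te.placeholder((m, k), name="data", dtype="float32")
    w_data = te.placeholder((num_blocks, bs_r, bs_c), name="w_data", dtype="float32")
    w_indices = te.placeholder((num_blocks,), name="w_indices", dtype="int32")
    w_indptr = te.placeholder((n // bs_r + 1,), name="w_indptr", dtype="int32")

    out = topi.nn.sparse_dense(data, w_data, w_indices, w_indptr)
    s = schedule_sparse_dense_cuda_baseline([out])
    # Inspect the lowered IR for the baseline CUDA schedule.
    return tvm.lower(s, [data, w_data, w_indices, w_indptr, out], simple_mode=True)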
def conv2d_direct_simd_nhwc_schedule(cfg, outs):
    """Schedule function for Cortex-M7 SIMD implementation of conv2d."""
    sched = te.create_schedule([x.op for x in outs])

    def _callback(op):
        if 'conv2d_nhwc' not in op.tag:
            return

        # extract tensors
        output = op.output(0)
        conv = op
        data_vec = conv.input_tensors[0]
        kernel = conv.input_tensors[1]  # pylint: disable=unused-variable
        last = outs[0]  # pylint: disable=unused-variable

        # tile reduction axes
        n, oh, ow, co = sched[conv].op.axis
        kh, kw, ci = sched[conv].op.reduce_axis

        M = cfg['tile_ow'].size[-1]
        K = cfg['tile_ci'].size[-1]
        N = cfg['tile_co'].size[-1]

        owo, owi = cfg['tile_ow'].apply(sched, conv, ow)
        cio, cii = cfg['tile_ci'].apply(sched, conv, ci)
        coo, coi = cfg['tile_co'].apply(sched, conv, co)
        cfg['reorder_0_simd'].apply(sched, conv, [n, oh, owo, owi, coo, coi, kh, kw, cio, cii])

        # replace the inner MxKxN tile with the SIMD GEMM micro-kernel
        gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype)
        sched[output].tensorize(owi, gemm)
        sched[output].pragma(n, 'import_c', gemm_MxKxN_impl(M, K, N, uniq_id))

        # this is the scope to attach global config inside this kernel
        kernel_scope = n

        # tune unroll
        sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
        sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)

    traverse_inline(sched, outs[-1].op, _callback)
    return sched
def conv2d_NCHW16c_OHWI16o_schedule(attrs, outs, target):
    from topi.util import traverse_inline

    sch = te.create_schedule([i.op for i in outs])
    output = outs[0].op

    def callback(op):
        nonlocal sch
        if len(list(op.reduce_axis)):
            a, b = op.input_tensors
            conv = op.output(0)
            n, c, h, w, _ = get_const_tuple(conv.shape)
            stride_h, stride_w = attrs.get_int_tuple('strides')
            if w % 32 == 0:
                _conv2d_schedule_wdim(sch, conv, output, stride_h, stride_w)
            else:
                assert h * w % 32 == 0 and 32 % w == 0
                _conv2d_schedule_fused(sch, conv, output, stride_h, stride_w)

    traverse_inline(sch, output, callback)
    return sch
def _matmul_schedule_asm(cfg, outs):
    """Schedule for matmul ops tagged 'asm_matmul_output'."""
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if "asm_matmul_output" in op.tag:
            # schedule matmul
            output = op.output(0)
            mat = op.input_tensors[0]

            sidx = 0
            if mat.op.input_tensors[0].name == "attr":
                sidx = 1
            a_vec = mat.op.input_tensors[sidx]
            b_vec = mat.op.input_tensors[sidx + 1]

            def recurs_inline(a_):
                if a_.op.input_tensors:
                    a1 = a_.op.input_tensors[0]
                    if a1.shape == a_.shape:
                        s[a1].compute_inline()
                        recurs_inline(a1)

            def recurs_inline_(a_):
                # Inline the whole producer chain of a_ (check the tensor's op,
                # since a_ itself is a Tensor, not a ComputeOp).
                if isinstance(a_.op, tvm.tensor.ComputeOp):
                    if a_.op.input_tensors:
                        a1 = a_.op.input_tensors[0]
                        s[a1].compute_inline()
                        recurs_inline_(a1)

            recurs_inline_(a_vec)
            recurs_inline_(b_vec)

            _schedule_asm(cfg, s, a_vec, b_vec, mat, output, outs[0])

    traverse_inline(s, outs[0].op, _callback)
    return s
def _depthwise_schedule_spatial_pack(cfg, outs):
    """schedule_depthwise_conv2d_nchw_arm's inner implementation"""
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if op.tag == "spatial_depthwise_conv_nchw_output":
            output = op.output(0)
            conv = op.input_tensors[0]
            data_vec = conv.op.input_tensors[0]
            kernel_vec = conv.op.input_tensors[1]
            if kernel_vec.op.name == "kernel_vec":
                kernel = kernel_vec.op.input_tensors[0]
            else:
                kernel = kernel_vec
            if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
                s[kernel].compute_inline()
            _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0])

    traverse_inline(s, outs[0].op, _callback)
    return s
def schedule(outs, strides, pattern, pragma, max_threads):
    from topi.util import traverse_inline

    sch = tvm.te.create_schedule([i.op for i in outs])
    output = outs[0].op

    def callback(op):
        if len(list(op.reduce_axis)):
            from .looptiler import analyze_tiling

            # Enumerate candidate tilings and sort them by the cost heuristic.
            points = list(analyze_tiling(op, pattern, max_parallel=max_threads))
            fobj = lambda x: (2**-x[0]) * (2**-x[1]) * x[2] * \
                             (x[3] * x[3] if 2 <= x[3] <= 8 else 1.0 / x[3])
            points.sort(key=fobj)
            points = points[::-1]
            #for x in points[::-1]:
            #    print((2 ** -x[0]), (2 ** -x[1]), x[2], (x[3] * x[3] if 2 <= x[3] <= 8 else 1.0 / x[3]))
            #    print(x[-1])

            a, b = op.input_tensors
            tune.ashape = get_const_tuple(a.shape)
            tune.bshape = get_const_tuple(b.shape)
            try:
                tune.strides = strides
            except:
                tune.strides = 'dense'

            # Pick the tiling point: either the best-ranked one (optionally
            # overridden by a recorded x86 config) or the one selected by
            # tune.cpu_idx during tuning.
            if tune.cpu_idx is None:
                to_apply = points[0][-1]
                import os
                HOME = os.getenv("HOME")
                try:
                    f = open(HOME + '/Tensorization-PoC/cpu-shapes.log', 'a')
                except:
                    f = open(HOME + '/UNIT/cpu-shapes.log', 'a')
                f.write(f'{tune.ashape} {tune.bshape} {tune.strides}\n')
                if (tune.ashape, tune.bshape, tune.strides) in tune.x86.keys():
                    to_apply = points[tune.x86[(tune.ashape, tune.bshape, tune.strides)]][-1]
            else:
                tune.total_idx = len(points)
                to_apply = points[tune.cpu_idx][-1]

            # Split the spatial axes according to the chosen tiling.
            to_schedule = output
            loops = []
            parallel_level = None
            for i in range(len(output.axis)):
                if isinstance(to_apply[i][0], tuple) and to_apply[i][0][1] == 'parallel':
                    to_schedule = op
                    if str(op) != str(output):
                        outer, inner = sch[output].split(output.axis[i], nparts=to_apply[i][0][0])
                        parallel_level = outer
                        sch[op].compute_at(sch[output], outer)
                        if i == len(output.axis) - 1:
                            sch[output].vectorize(inner)
                        else:
                            sch[output].vectorize(output.axis[-1])
                to_append = []
                to_split = to_schedule.axis[i]
                for j in to_apply[i][1:][::-1]:
                    if isinstance(j, int):
                        outer, inner = sch[to_schedule].split(to_split, j)
                        to_split = outer
                    else:
                        outer, inner = sch[to_schedule].split(to_split, j[0])
                        to_split = outer
                    to_append = [inner] + to_append
                to_append = [to_split] + to_append
                loops += to_append

            # Split the reduction axes the same way.
            for i in range(len(op.reduce_axis)):
                to_split = op.reduce_axis[i]
                to_append = []
                for j in to_apply[i + len(op.axis)][1:][::-1]:
                    if isinstance(j, int):
                        outer, inner = sch[op].split(to_split, j)
                        to_split = outer
                    else:
                        outer, inner = sch[op].split(to_split, j[0])
                        to_split = outer
                    to_append = [inner] + to_append
                to_append = [to_split] + to_append
                loops += to_append

            # Collect the per-loop annotations emitted by the tiling point.
            annot = []
            for i, elem in enumerate(to_apply):
                for j in elem:
                    if isinstance(j, int):
                        annot.append(None if i < len(op.axis) else 'reduce')
                    else:
                        annot.append(j[1])
            assert len(annot) == len(loops), '%d != %d' % (len(annot), len(loops))

            # Classify the resulting loops by their annotation.
            unroll, stencil, simple, reduction = [], [], [], []
            for i, elem in enumerate(zip(annot, loops)):
                # print(elem)
                hint, axis = elem
                if unroll and hint is None:
                    unroll.append(axis)
                elif hint == 'parallel':
                    fusion = sch[output].fuse(
                        *(simple + [parallel_level if parallel_level is not None else axis]))
                    sch[output].parallel(fusion)
                    if str(op) != str(output):
                        sch[op].compute_at(sch[output], fusion)
                    simple = []
                elif hint == 'unroll':
                    unroll.append(axis)
                elif hint == 'offload':
                    stencil.append(axis)
                elif hint == 'reduction':
                    reduction.append(axis)
                else:
                    simple.append(axis)

            # Offload the stencil loops to the tensorized micro-kernel, then
            # unroll and reorder the remaining loops.
            sch[op].pragma(stencil[0], 'tensorize', pragma)
            if tune.parallel_only:
                if str(op) != str(output):
                    sch[op].reorder(*(simple + unroll + reduction + stencil))
                else:
                    sch[op].reorder(*([fusion] + unroll + simple + reduction + stencil))
                return
            for i in unroll:
                sch[op].unroll(i)
            #if simple:
            #    unroll = [simple[-1]] + unroll
            #    simple = simple[:-1]
            if str(op) != str(output):
                sch[op].reorder(*(simple + reduction + unroll + stencil))
            else:
                sch[op].reorder(*([fusion] + simple + reduction + unroll + stencil))

    traverse_inline(sch, output, callback)
    return sch
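# --- shared pattern note (illustrative, not part of the original modules) ---
# Every schedule function above follows the same TOPI idiom: create a schedule
# over the output ops, then let traverse_inline() walk the compute graph,
# inlining injective ops and handing each remaining op to a callback that
# dispatches on op.tag. A minimal sketch of that shared skeleton, assuming
# `te` and `traverse_inline` are imported as in the functions above; the helper
# name and the `schedule_one` callback are hypothetical:
def _tagged_schedule(outs, tag, schedule_one):
    """Apply `schedule_one(s, op)` to every op whose tag contains `tag`."""
    s = te.create_schedule([x.op for x in outs])

    def _callback(op):
        if tag in op.tag:
            schedule_one(s, op)

    traverse_inline(s, outs[0].op, _callback)
    return s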