def _fallback_schedule(N, F, Y, X): # pylint: disable=unused-argument # split N (batch dimension) if N > 1: cfg["tile_n"] = SplitEntity([-1, 1, 1, 4]) else: cfg["tile_n"] = SplitEntity([1, 1, 1, 1]) # split F (output channel dimension) if F > 1: cfg["tile_f"] = SplitEntity([-1, 1, 64, 1]) # split Y (height dimension) y_split_factor = 1 for candidate in range(5, 17): if Y % candidate == 0: y_split_factor = candidate break cfg["tile_y"] = SplitEntity([-1, 1, 1, y_split_factor]) # split X (width dimension) x_split_factor = 1 for candidate in range(5, 17): if X % candidate == 0: x_split_factor = candidate break cfg["tile_x"] = SplitEntity([-1, x_split_factor, 1, 1]) # split RC (input channel dimension, which is a reduction axis) cfg["tile_rc"] = SplitEntity([-1, 1, 16]) # other configurations cfg["fuse_yx"] = OtherOptionEntity(False) cfg["unroll_explicit"] = OtherOptionEntity(True) cfg["auto_unroll_max_step"] = OtherOptionEntity(1500)
def schedule_conv(conv): conv_data = conv.op.input_tensors[0] kernel_data = conv.op.input_tensors[1] in_type = conv_data.dtype _, _, IC, channel_multiplier = get_const_tuple(kernel_data.shape) n, w, h, c = conv.op.axis r_h, r_w = conv.op.reduce_axis ho, hi = cfg["tile_h"].apply(s, conv, h) wo, wi = cfg["tile_w"].apply(s, conv, w) co, ci = cfg["tile_c"].apply(s, conv, c) split_val = cfg["tile_c"].size[-1] use_tensorization = ((in_type == "int16") and (split_val == 8) and (IC % split_val == 0) and (channel_multiplier == 1) and is_aarch64_arm()) data_pad_value = -1 if conv_data.name == "data_pad": assert isinstance(conv_data.op, tvm.te.ComputeOp) # Define a strategy for padding computation cfg.define_knob("data_pad_strategy", [1, 2, 3]) if cfg.is_fallback: # We cannot inline padding when tensorizing. # So, if we can tensorize, let's compute_at the closest axis cfg["data_pad_strategy"] = (OtherOptionEntity(2) if use_tensorization else OtherOptionEntity(3)) # Compute padding on the third to last axis of the computation if cfg["data_pad_strategy"].val == 1: s[conv_data].vectorize(list(s[conv_data].op.axis)[-1]) s[conv_data].compute_at(s[conv], ho) # Compute padding on the second to last axis of the computation if cfg["data_pad_strategy"].val == 2: s[conv_data].vectorize(list(s[conv_data].op.axis)[-1]) s[conv_data].compute_at(s[conv], wo) # Inline padding during computation if cfg["data_pad_strategy"].val == 3: s[conv_data].compute_inline() data_pad_value = cfg["data_pad_strategy"].val if use_tensorization and data_pad_value != 3: smlal = smlal_int16_int32() s[conv].tensorize(ci, smlal) else: s[conv].vectorize(ci) if cfg["unroll_tile"].val: s[conv].unroll(r_h) s[conv].unroll(r_w) s[conv].unroll(wi) s[conv].unroll(hi) s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci) fused_n_ho = s[conv].fuse(n, ho) return fused_n_ho
def _fallback_schedule_int8(cfg, wkl): HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 oc_bn = 16 assert wkl.out_filter % oc_bn == 0 ic_bn = 1 for bn in range(oc_bn, 0, -4): if wkl.in_filter % bn == 0: ic_bn = bn break assert wkl.in_filter % 4 == 0 reg_n = 1 for n in range(31, 0, -1): if out_width % n == 0: reg_n = n break cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn]) cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn]) cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) cfg["unroll_kw"] = OtherOptionEntity(False)
def _fallback_schedule_int8(cfg, wkl): pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.stride_h, wkl.stride_w out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1 oc_bn = 16 assert wkl.out_filter % oc_bn == 0 ic_bn = 1 for bn in range(oc_bn, 0, -4): if wkl.in_filter % bn == 0: ic_bn = bn break assert wkl.in_filter % 4 == 0 reg_n = 1 for n in range(31, 0, -1): if out_width % n == 0: reg_n = n break cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn]) cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn]) cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) cfg["unroll_kw"] = OtherOptionEntity(False)
def _fallback_schedule(cfg, wkl): simd_width = get_simd_32bit_lanes() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.stride_h, wkl.stride_w dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): if wkl.out_filter % bn == 0: oc_bn = bn break ic_bn = 1 for bn in range(oc_bn, 0, -1): if wkl.in_filter % bn == 0: ic_bn = bn break reg_n = 1 for n in range(31, 0, -1): if out_width % n == 0: reg_n = n break cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn]) cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn]) cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) cfg["unroll_kw"] = OtherOptionEntity(False)
def schedule_conv(conv): conv_data = conv.op.input_tensors[0] n, w, h, c = conv.op.axis r_h, r_w = conv.op.reduce_axis ho, hi = cfg["tile_h"].apply(s, conv, h) wo, wi = cfg["tile_w"].apply(s, conv, w) co, ci = cfg["tile_c"].apply(s, conv, c) if conv_data.name == "data_pad": assert isinstance(conv_data.op, tvm.te.ComputeOp) # Define a policy for padding computation cfg.define_knob("data_pad_inline", [1, 2, 3]) if cfg.is_fallback: cfg["data_pad_inline"] = OtherOptionEntity(3) if cfg["data_pad_inline"].val == 1: s[conv_data].vectorize(list(s[conv_data].op.axis)[-1]) s[conv_data].compute_at(s[conv], ho) if cfg["data_pad_inline"].val == 2: s[conv_data].vectorize(list(s[conv_data].op.axis)[-1]) s[conv_data].compute_at(s[conv], wo) if cfg["data_pad_inline"].val == 3: s[conv_data].compute_inline() s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci) fused_n_ho = s[conv].fuse(n, ho) s[conv].vectorize(ci) return fused_n_ho
def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() DPAD, HPAD, WPAD = wkl.dpad, wkl.hpad, wkl.wpad DSTR, HSTR, WSTR = wkl.dstride, wkl.hstride, wkl.wstride out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): if wkl.out_filter % bn == 0: oc_bn = bn break ic_bn = 1 for bn in range(oc_bn, 0, -1): if wkl.in_filter % bn == 0: ic_bn = bn break reg_n = 1 for n in range(7, 0, -1): if out_width % n == 0: reg_n = n break cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn]) cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn]) cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) cfg["unroll_kw"] = OtherOptionEntity(False)
def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): if wkl.out_filter % bn == 0: oc_bn = bn break ic_bn = 1 for bn in range(oc_bn, 0, -1): if wkl.in_filter % bn == 0: ic_bn = bn break for ow_factor in range(out_width, 0, -1): if out_width % ow_factor == 0: for oh_factor in range(out_height, 0, -1): if out_height % oh_factor == 0 and ow_factor * oh_factor < 32: cfg["tile_ic"] = SplitEntity( [wkl.in_filter // ic_bn, ic_bn]) cfg["tile_oc"] = SplitEntity( [wkl.out_filter // oc_bn, oc_bn]) cfg["tile_oh"] = OtherOptionEntity(oh_factor) cfg["tile_ow"] = SplitEntity( [out_width // ow_factor, ow_factor]) return raise ValueError( "cannot decide default schedule for workload: {}".format(wkl))
def conv2d_cudnn(cfg, data, kernel, strides, padding, dilation, groups=1, layout="NCHW", out_dtype="float32"): """Compute conv2d using CuDNN library""" if layout == "NCHW": tensor_format = 0 # CUDNN_TENSOR_NCHW N, _, H, W = get_const_tuple(data.shape) elif layout == "NHWC": tensor_format = 1 # CUDNN_TENSOR_NHWC N, H, W, _ = get_const_tuple(data.shape) else: raise ValueError("Unsupported layout %s in cudnn" % layout) CO, CI, KH, KW = get_const_tuple(kernel.shape) # handle dilation stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides dilation_h, dilation_w = (dilation, dilation) if isinstance( dilation, int) else dilation if (isinstance(padding, (list, tuple)) and len(padding) == 4 and (padding[0] != padding[2] or padding[1] != padding[3])): raise ValueError("Cudnn doesn't support asymmetric padding.") pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) OH = (H + pt + pb - KH) // stride_h + 1 OW = (W + pl + pr - KW) // stride_w + 1 cfg.add_flop(groups * 2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1)) if data.dtype == "int8" or kernel.dtype == "int8": if layout == "NCHW": raise ValueError("NCHW layout do not support int8 in cudnn") dtype = "int32" else: dtype = data.dtype cfg.define_knob("algo", range(8)) if cfg.is_fallback: # Let CUDNN choose the best algo cfg["algo"] = OtherOptionEntity(-1) return cudnn.conv_forward( data, kernel, [pt, pl], # cudnn padding pt, pl on both sides of input [stride_h, stride_w], [dilation_h, dilation_w], conv_mode=1, tensor_format=tensor_format, algo=cfg["algo"].val, conv_dtype=dtype, groups=groups, )
def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False): if is_depthwise: raise RuntimeError("Depthwise not supported for intel graphics.") batch_size, in_channel, height, width = get_const_tuple(data.shape) out_channel, _, hkernel, _ = get_const_tuple(kernel.shape) HSTR, _ = strides ic_bn = 1 oc_bn, oc_bn_upper = 16, 16 for i in range(oc_bn_upper, 0, -1): if out_channel % i == 0: oc_bn = i break if HSTR == 2: if out_channel + hkernel == 515: block_oh = 4 block_ow = 4 else: block_oh = 4 block_ow = 5 elif hkernel == 3: if out_channel == 512: block_oh = 2 block_ow = 7 else: block_oh = 2 block_ow = 14 else: block_oh = 1 block_ow = 16 cfg["tile_ic"] = SplitEntity([in_channel // ic_bn, ic_bn]) cfg["tile_oc"] = SplitEntity([out_channel // oc_bn, oc_bn]) cfg["block_oh"] = OtherOptionEntity(block_oh) cfg["block_ow"] = OtherOptionEntity(block_ow)
def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, num_int8_elements): """Fallback schedule for 1x1 conv2d int8 on cpu. Normally the inner most pattern takes two int8/uint8 tensors data[num_int8_elements] and kernel[int32_lanes, num_int8_elements], produces a dot product int32/uint32 output[int32_lanes]. Parameters ---------- int32_lanes : int How many numbers of int32/uint32 will be produced using intrinsic. This is related to output channel. num_int8_elements : int How many numbers of input int32/uint32 will be multiplied and reduced. This is related to input channel. """ pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.stride_h, wkl.stride_w out_height = (wkl.height + pt + pb - wkl.kernel_h) // HSTR + 1 out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( wkl.out_filter, int32_lanes, ) assert wkl.in_filter % num_int8_elements == 0, "wkl.in_filter=%d, num_int8_elements=%d" % ( wkl.in_filter, num_int8_elements, ) oc_bn = int32_lanes if int32_lanes >= num_int8_elements else num_int8_elements ic_bn = 1 for bn in range(oc_bn, 0, -4): if wkl.in_filter % bn == 0: ic_bn = bn break for ow_factor in range(out_width, 0, -1): if out_width % ow_factor == 0: for oh_factor in range(out_height, 0, -1): if out_height % oh_factor == 0 and ow_factor * oh_factor < 32: cfg["tile_ic"] = SplitEntity( [wkl.in_filter // ic_bn, ic_bn]) cfg["tile_oc"] = SplitEntity( [wkl.out_filter // oc_bn, oc_bn]) cfg["tile_oh"] = OtherOptionEntity(oh_factor) cfg["tile_ow"] = SplitEntity( [out_width // ow_factor, ow_factor]) return raise ValueError( "cannot decide default schedule for workload: {}".format(wkl))
def configure_knobs(cfg, M, K): """ Configure auto-tuning knobs for the interleaved strategy """ x, y = cfg.axis(M // 4), cfg.axis(K // 16) cfg.define_reorder("reorder_gemm", [x, y], policy="candidate", candidate=[[x, y], [y, x]]) outer_loop, inner_loop = cfg.axis(4), cfg.axis(16) cfg.define_annotate("A_interleaved_unroll_vec", [outer_loop, inner_loop], policy="try_unroll_vec") # Fallback configuration if cfg.is_fallback: cfg["reorder_gemm"] = ReorderEntity([0, 1]) cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"]) if not is_dotprod_available(): cfg.define_knob("gemm_quantized_unroll", [True, False]) cfg.define_knob("gemm_quantized_interleave", [True, False]) if cfg.is_fallback: cfg["gemm_quantized_unroll"] = OtherOptionEntity(False) cfg["gemm_quantized_interleave"] = OtherOptionEntity(True)
def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements): """Fallback schedule for conv2d int8 on cpu. Normally the inner most pattern takes two int8/uint8 tensors data[num_int8_elements] and kernel[int32_lanes, num_int8_elements], produces a dot product int32/uint32 output[int32_lanes]. Parameters ---------- int32_lanes : int How many numbers of int32/uint32 will be produced using intrinsic. This is related to output channel. num_int8_elements : int How many numbers of input int32/uint32 will be multiplied and reduced. This is related to input channel. """ pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.stride_h, wkl.stride_w dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( wkl.out_filter, int32_lanes, ) assert wkl.in_filter % num_int8_elements == 0, "wkl.in_filter=%d, num_int8_elements=%d" % ( wkl.in_filter, num_int8_elements, ) oc_bn = int32_lanes if int32_lanes >= num_int8_elements else num_int8_elements ic_bn = 1 for bn in range(oc_bn, 0, -4): if wkl.in_filter % bn == 0: ic_bn = bn break reg_n = 1 for n in range(31, 0, -1): if out_width % n == 0: reg_n = n break cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn]) cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn]) cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) cfg["unroll_kw"] = OtherOptionEntity(False)
def _fallback_schedule(cfg, wkl): """ Get default schedule for the workload Parameters ---------- cfg : tvm.autotvm.task.space.FallbackConfigEntity Fallback config to be updated wkl : topi.nn.depthwise_conv2d.Workload Convolution workload """ simd_width = get_simd_32bit_lanes() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.stride_h, wkl.stride_w dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 out_width = (wkl.width - dilated_kernel_w + pl + pr) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): if wkl.out_filter % bn == 0: oc_bn = bn break ic_bn = 1 for bn in range(oc_bn, 0, -1): if wkl.in_filter % bn == 0: ic_bn = bn break reg_n = 1 for n in range(31, 0, -1): if out_width % n == 0: reg_n = n break cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn]) cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn]) cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) cfg["unroll_kw"] = OtherOptionEntity(False)
def _fallback_schedule(cfg, wkl): simd_width = 4 # assume ARM SIMD Width is 4 pad_left, pad_right = wkl.padl, wkl.padr stride_w = wkl.stride_w out_width = (wkl.width + pad_left + pad_right - wkl.kernel_w) // stride_w + 1 groups = wkl.groups kernels_per_group = wkl.out_filter // groups kernel_depth = wkl.in_filter // groups oc_bn = 1 oc_bn = 1 for bn in range(simd_width, 0, -1): if kernels_per_group % bn == 0: oc_bn = bn break if oc_bn > kernels_per_group: oc_bn = kernels_per_group ic_bn = 1 for bn in range(oc_bn, 0, -1): if kernel_depth % bn == 0: ic_bn = bn break if ic_bn > kernel_depth: ic_bn = kernel_depth reg_n = 1 for n in range(31, 0, -1): if out_width % n == 0: reg_n = n break cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn]) cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn]) cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) cfg["unroll_kw"] = OtherOptionEntity(False)
def _fallback_schedule(cfg, wkl): """ Get default schedule for the workload Parameters ---------- cfg : tvm.autotvm.task.space.FallbackConfigEntity Fallback config to be updated wkl : topi.nn.depthwise_conv2d.Workload Convolution workload """ simd_width = get_fp32_len() HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): if wkl.out_filter % bn == 0: oc_bn = bn break ic_bn = 1 for bn in range(oc_bn, 0, -1): if wkl.in_filter % bn == 0: ic_bn = bn break reg_n = 1 for n in range(31, 0, -1): if out_width % n == 0: reg_n = n break cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn]) cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn]) cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) cfg["unroll_kw"] = OtherOptionEntity(False)
def _fallback_schedule(cfg, wkl): simd_width = get_simd_32bit_lanes() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.stride_h, wkl.stride_w dilated_kernel_h = (wkl.kernel_h - 1) * wkl.dilation_h + 1 dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 out_height = (wkl.height + pt + pb - dilated_kernel_h) // HSTR + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): if wkl.out_filter % bn == 0: oc_bn = bn break ic_bn = 1 for bn in range(oc_bn, 0, -1): if wkl.in_filter % bn == 0: ic_bn = bn break for ow_factor in range(out_width, 0, -1): if out_width % ow_factor == 0: for oh_factor in range(out_height, 0, -1): if out_height % oh_factor == 0 and ow_factor * oh_factor < 32: cfg["tile_ic"] = SplitEntity( [wkl.in_filter // ic_bn, ic_bn]) cfg["tile_oc"] = SplitEntity( [wkl.out_filter // oc_bn, oc_bn]) cfg["tile_oh"] = OtherOptionEntity(oh_factor) cfg["tile_ow"] = SplitEntity( [out_width // ow_factor, ow_factor]) return raise ValueError( "cannot decide default schedule for workload: {}".format(wkl))
def schedule_depthwise_conv2d_nhwc(cfg, outs): """Create the schedule for depthwise_conv2d_nchw_spatial_pack""" outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) out = outs[0] ##### space definition begin ##### n, h, w, c = s[out].op.axis # Split the number of input/output channels cfg.define_split("tile_c", c, num_outputs=2) # Split the height of the convolution _, hi = cfg.define_split("tile_h", h, num_outputs=2) # Split the width of the convolution _, wi = cfg.define_split("tile_w", w, num_outputs=2) # Additional out (e.g., requantization, bias addition, etc..) # 0: locate the output on the second last axis of the main compuation # 1: locate the output closest to the main computation cfg.define_knob("locate_output", [0, 1]) # Determine if we should unroll the computation of the inner tile cfg.define_knob("unroll_tile", [True, False]) # fallback support if cfg.is_fallback: cfg["tile_c"] = SplitEntity([-1, 8]) cfg["tile_h"] = SplitEntity([-1, 2]) cfg["tile_w"] = SplitEntity([-1, 2]) cfg["locate_output"] = OtherOptionEntity(1) cfg["unroll_tile"] = OtherOptionEntity(True) ##### space definition end ##### def schedule_conv(conv): conv_data = conv.op.input_tensors[0] kernel_data = conv.op.input_tensors[1] in_type = conv_data.dtype _, _, IC, channel_multiplier = get_const_tuple(kernel_data.shape) n, w, h, c = conv.op.axis r_h, r_w = conv.op.reduce_axis ho, hi = cfg["tile_h"].apply(s, conv, h) wo, wi = cfg["tile_w"].apply(s, conv, w) co, ci = cfg["tile_c"].apply(s, conv, c) split_val = cfg["tile_c"].size[-1] use_tensorization = ( (in_type == "int16") and (split_val == 8) and (IC % split_val == 0) and (channel_multiplier == 1) and is_aarch64_arm() ) data_pad_value = -1 if conv_data.name == "data_pad": assert isinstance(conv_data.op, tvm.te.ComputeOp) # Define a strategy for padding computation cfg.define_knob("data_pad_strategy", [1, 2, 3]) if cfg.is_fallback: # We cannot inline padding when tensorizing. # So, if we can tensorize, let's compute_at the closest axis cfg["data_pad_strategy"] = ( OtherOptionEntity(2) if use_tensorization else OtherOptionEntity(3) ) # Compute padding on the third to last axis of the computation if cfg["data_pad_strategy"].val == 1: s[conv_data].vectorize(list(s[conv_data].op.axis)[-1]) s[conv_data].compute_at(s[conv], ho) # Compute padding on the second to last axis of the computation if cfg["data_pad_strategy"].val == 2: s[conv_data].vectorize(list(s[conv_data].op.axis)[-1]) s[conv_data].compute_at(s[conv], wo) # Inline padding during computation if cfg["data_pad_strategy"].val == 3: s[conv_data].compute_inline() data_pad_value = cfg["data_pad_strategy"].val if use_tensorization and data_pad_value != 3: smlal = smlal_int16_int32() s[conv].tensorize(ci, smlal) else: s[conv].vectorize(ci) if cfg["unroll_tile"].val: s[conv].unroll(r_h) s[conv].unroll(r_w) s[conv].unroll(wi) s[conv].unroll(hi) s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci) fused_n_ho = s[conv].fuse(n, ho) return fused_n_ho def schedule_conv_out(out): n, h, w, c = out.op.axis co, ci = cfg["tile_c"].apply(s, out, c) wo, wi = cfg["tile_w"].apply(s, out, w) ho, hi = cfg["tile_h"].apply(s, out, h) s[out].reorder(n, ho, wo, co, hi, wi, ci) if cfg["unroll_tile"]: s[out].unroll(wi) s[out].unroll(hi) if out.dtype in ["int8", "uint8"]: # In case of quantized convolution further split the channel in batches of 4 elements # so that we can use arm intrinsics to run fixed_point_multiplication ci_outer, ci_inner = s[out].split(ci, 4) s[out].vectorize(ci_inner) s[out].unroll(ci_outer) fused_n_ho = s[out].fuse(n, ho) return hi, wi, fused_n_ho def _callback(op): if op.name == "depthwise_conv2d_nhwc_output": conv = op.output(0) if conv != out: hi, wi, p_axis = schedule_conv_out(out) schedule_conv(conv) if cfg["locate_output"].val == 0: s[conv].compute_at(s[out], hi) if cfg["locate_output"].val == 1: s[conv].compute_at(s[out], wi) else: p_axis = schedule_conv(out) s[out].parallel(p_axis) traverse_inline(s, outs[0].op, _callback) return s
def schedule_depthwise_conv2d_nhwc(cfg, outs): """Create the schedule for depthwise_conv2d_nchw_spatial_pack""" outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) out = outs[0] ##### space definition begin ##### n, h, w, c = s[out].op.axis cfg.define_split("tile_c", c, num_outputs=2) _, hi = cfg.define_split("tile_h", h, num_outputs=2) _, wi = cfg.define_split("tile_w", w, num_outputs=2) cfg.define_knob("locate_output", [0, 1]) # fallback support if cfg.is_fallback: cfg["tile_c"] = SplitEntity([-1, 8]) cfg["tile_h"] = SplitEntity([-1, 2]) cfg["tile_w"] = SplitEntity([-1, 2]) cfg["locate_output"] = OtherOptionEntity(1) ##### space definition end ##### def schedule_conv(conv): conv_data = conv.op.input_tensors[0] n, w, h, c = conv.op.axis r_h, r_w = conv.op.reduce_axis ho, hi = cfg["tile_h"].apply(s, conv, h) wo, wi = cfg["tile_w"].apply(s, conv, w) co, ci = cfg["tile_c"].apply(s, conv, c) if conv_data.name == "data_pad": assert isinstance(conv_data.op, tvm.te.ComputeOp) # Define a policy for padding computation cfg.define_knob("data_pad_inline", [1, 2, 3]) if cfg.is_fallback: cfg["data_pad_inline"] = OtherOptionEntity(3) if cfg["data_pad_inline"].val == 1: s[conv_data].vectorize(list(s[conv_data].op.axis)[-1]) s[conv_data].compute_at(s[conv], ho) if cfg["data_pad_inline"].val == 2: s[conv_data].vectorize(list(s[conv_data].op.axis)[-1]) s[conv_data].compute_at(s[conv], wo) if cfg["data_pad_inline"].val == 3: s[conv_data].compute_inline() s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci) fused_n_ho = s[conv].fuse(n, ho) s[conv].vectorize(ci) return fused_n_ho def schedule_conv_out(out): n, h, w, c = out.op.axis co, ci = cfg["tile_c"].apply(s, out, c) wo, wi = cfg["tile_w"].apply(s, out, w) ho, hi = cfg["tile_h"].apply(s, out, h) s[out].reorder(n, ho, wo, co, hi, wi) if out.dtype in ["int8", "uint8"]: # In case of quantized convolution further split the channel in batches of 4 elements # so that we can use arm intrinsics to run fixed_point_multiplication ci_outer, ci_inner = s[out].split(ci, 4) s[out].vectorize(ci_inner) fused_n_ho = s[out].fuse(n, ho) return hi, wi, fused_n_ho def _callback(op): if op.name == "depthwise_conv2d_nhwc_output": conv = op.output(0) if conv != out: hi, wi, p_axis = schedule_conv_out(out) schedule_conv(conv) if cfg["locate_output"].val == 0: s[conv].compute_at(s[out], hi) if cfg["locate_output"].val == 1: s[conv].compute_at(s[out], wi) else: p_axis = schedule_conv(out) s[out].parallel(p_axis) traverse_inline(s, outs[0].op, _callback) return s
def _schedule(cfg, op): C = op.output(0) A, B = s[C].op.input_tensors if len(B.op.input_tensors) == 1 and B.op.input_tensors[0] == A: s[B].compute_inline() _, M, N = get_const_tuple(C.shape) AA = s.cache_read(A, "shared", [C]) AL = s.cache_read(AA, "local", [C]) BB = s.cache_read(B, "shared", [C]) BL = s.cache_read(BB, "local", [C]) CC = s.cache_write(C, "local") if op not in s.outputs: s[C].compute_inline() C = s.outputs[0].output(0) b, y, x = s[C].op.axis (k, ) = s[CC].op.reduce_axis cfg.define_split("tile_y", y, num_outputs=3) cfg.define_split("tile_x", x, num_outputs=3) cfg.define_split("tile_k", k, num_outputs=2) cfg.define_knob("auto_unroll_max_step", [8, 16, 32, 64]) target = tvm.target.Target.current() if target.kind.name in ["nvptx", "rocm"]: # llvm-based backends cannot do non-explicit unrolling cfg.define_knob("unroll_explicit", [1]) else: cfg.define_knob("unroll_explicit", [0, 1]) if cfg.is_fallback: y_bn = get_max_power2_factor(M, 64) x_bn = get_max_power2_factor(N, 64) y_nthreads = min(y_bn, 8) x_nthreads = min(x_bn, 8) cfg["tile_x"] = SplitEntity([-1, x_nthreads, x_bn // x_nthreads]) cfg["tile_y"] = SplitEntity([-1, y_nthreads, y_bn // y_nthreads]) cfg["tile_k"] = SplitEntity([-1, 8]) cfg["auto_unroll_max_step"] = OtherOptionEntity(16) by, ty, yi = cfg["tile_y"].apply(s, C, y) bx, tx, xi = cfg["tile_x"].apply(s, C, x) thread_x = te.thread_axis("threadIdx.x") thread_y = te.thread_axis("threadIdx.y") s[C].reorder(b, by, bx, ty, tx, yi, xi) s[C].bind(b, te.thread_axis("blockIdx.z")) s[C].bind(by, te.thread_axis("blockIdx.y")) s[C].bind(bx, te.thread_axis("blockIdx.x")) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) s[C].pragma(yi, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) s[C].pragma(yi, "unroll_explicit", cfg["unroll_explicit"].val) s[CC].compute_at(s[C], tx) _, yi, xi = s[CC].op.axis ko, ki = cfg["tile_k"].apply(s, CC, k) s[CC].reorder(ko, ki, yi, xi) s[CC].pragma(ki, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) s[CC].pragma(ki, "unroll_explicit", cfg["unroll_explicit"].val) s[AA].compute_at(s[CC], ko) s[AL].compute_at(s[CC], ki) s[BB].compute_at(s[CC], ko) s[BL].compute_at(s[CC], ki) _, y, k = s[AA].op.axis ty, yi = s[AA].split(y, nparts=cfg["tile_y"].size[1]) tx, ki = s[AA].split(k, nparts=cfg["tile_x"].size[1]) s[AA].reorder(ty, tx, yi, ki) s[AA].bind(ty, thread_y) s[AA].bind(tx, thread_x) s[AA].pragma(yi, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) s[AA].pragma(yi, "unroll_explicit", cfg["unroll_explicit"].val) _, x, k = s[BB].op.axis ty, xi = s[BB].split(x, nparts=cfg["tile_y"].size[1]) tx, ki = s[BB].split(k, nparts=cfg["tile_x"].size[1]) s[BB].bind(ty, thread_y) s[BB].bind(tx, thread_x) s[BB].reorder(ty, tx, xi, ki) s[BB].pragma(xi, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) s[BB].pragma(xi, "unroll_explicit", cfg["unroll_explicit"].val)
def compute_conv2d_gemm_without_weight_transform(cfg, data, B_interleaved_t, strides, padding, dilation, out_dtype, kernel_size, output_channels): """Compute conv2d by transforming the input, executing GEMM and transforming the output back""" batches, IH, IW, IC = get_const_tuple(data.shape) KH, KW = get_const_tuple(kernel_size) OC = get_const_int(output_channels) K_AREA = KH * KW if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = get_const_tuple(dilation) dilated_kernel_h = (KH - 1) * dilation_h + 1 dilated_kernel_w = (KW - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1 OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1 if pad_top or pad_left: data_pad = nn.pad(data, [0, pad_top, pad_left, 0], [0, pad_down, pad_right, 0], name="data_pad") else: data_pad = data # --- Im2col M = OH * OW K = IC * K_AREA N = OC A_shape = (batches, M, K) if K_AREA == 1: A = te.compute( A_shape, lambda n, x, y: data_pad[n, HSTR * (x // OW), WSTR * (x % OW), y], name="data_flatten", ) else: A = te.compute( A_shape, lambda n, x, y: data_pad[n, HSTR * (x // OW) + dilation_h * ( (y // IC) // KW), WSTR * (x % OW) + dilation_w * ((y // IC) % KW), y % IC, ], name="data_im2col", ) N_transformed = B_interleaved_t.shape[0] # --- Pad if necessary idxm = tvm.tir.indexmod pad_m = 0 pad_k = 0 if M % 4 != 0: pad_m = 4 - (M % 4) if K % 16 != 0: pad_k = 16 - (K % 16) M_padded = M + pad_m K_padded = K + pad_k pad_before = (0, 0, 0) pad_after = (0, pad_m, pad_k) if pad_m != 0 or pad_k != 0: A = nn.pad(A, pad_before=pad_before, pad_after=pad_after, name="A_padded") # --- GEMM: A*B' k = te.reduce_axis((0, K_padded), "k") A_interleaved = te.compute( (batches, M_padded // 4, K_padded // 16, 4, 16), lambda b, x, y, z, w: A[b, z + 4 * x, w + 16 * y], name="A_interleaved", ) C_interleaved = te.compute( (batches, M_padded // 4, N_transformed, 4, 4), lambda b, x, y, w, z: te.sum( A_interleaved[b, x, k // 16, w, idxm(k, 16)].astype(out_dtype) * B_interleaved_t[y, k // 16, z, idxm(k, 16)].astype(out_dtype), axis=k, ), name="C_interleaved", ) # --- Unpack C C = te.compute( (batches, M, N), lambda b, x, y: C_interleaved[b, x // 4, y // 4, idxm(x, 4), idxm(y, 4)], name="C", ) # --- Produce the conv output out_shape = (batches, OH, OW, OC) out = te.compute(out_shape, lambda b, x, y, z: C(b, y + OW * x, z), name="conv2d_gemm_output") # Configuration space x, y = cfg.axis(M_padded // 4), cfg.axis(K_padded // 16) cfg.define_reorder("reorder_gemm", [x, y], policy="candidate", candidate=[[x, y], [y, x]]) outer_loop, inner_loop = cfg.axis(4), cfg.axis(16) cfg.define_annotate("A_interleaved_unroll_vec", [outer_loop, inner_loop], policy="try_unroll_vec") cfg.define_knob("gemm_quantized_unroll", [True, False]) cfg.define_knob("gemm_quantized_interleave", [True, False]) # Fallback configuration if cfg.is_fallback: cfg["reorder_gemm"] = ReorderEntity([0, 1]) cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"]) cfg["gemm_quantized_unroll"] = OtherOptionEntity(False) cfg["gemm_quantized_interleave"] = OtherOptionEntity(True) return out
def conv2d_cudnn( cfg, data, kernel, strides, padding, dilation, groups=1, layout="NCHW", out_dtype="float32" ): """Compute conv2d using CuDNN library""" if layout == "NCHW": tensor_format = 0 # CUDNN_TENSOR_NCHW N, _, H, W = get_const_tuple(data.shape) elif layout == "NHWC": tensor_format = 1 # CUDNN_TENSOR_NHWC N, H, W, _ = get_const_tuple(data.shape) else: raise ValueError("Unsupported layout %s in cudnn" % layout) CO, CI, KH, KW = get_const_tuple(kernel.shape) # handle dilation stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation KH_dilated = (KH - 1) * dilation_h + 1 KW_dilated = (KW - 1) * dilation_h + 1 pt, pl, pb, pr = get_pad_tuple(padding, (KH_dilated, KW_dilated)) if (pt != pb) or (pl != pr): raise ValueError("Cudnn doesn't support asymmetric padding.") OH = (H + pt + pb - KH) // stride_h + 1 OW = (W + pl + pr - KW) // stride_w + 1 if isinstance(N, int): cfg.add_flop( groups * 2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1) ) if data.dtype == "int8" or kernel.dtype == "int8": if layout == "NCHW": raise ValueError("NCHW layout do not support int8 in cudnn") dtype = "int32" else: dtype = data.dtype cfg.define_knob("algo", range(cudnn.algo_to_index("fwd", "CUDNN_CONVOLUTION_FWD_ALGO_COUNT"))) if cfg.is_fallback: if cudnn.exists(): # Let CUDNN choose the best algo, based on benchmarks run # on the local machine. In the future, this should be # based on parameters stored in the Target. cfg["algo"] = OtherOptionEntity(-1) else: cfg["algo"] = OtherOptionEntity(0) return cudnn.conv_forward( data, kernel, [pt, pl], # cudnn padding pt, pl on both sides of input [stride_h, stride_w], [dilation_h, dilation_w], conv_mode=1, tensor_format=tensor_format, algo=cfg["algo"].val, conv_dtype=dtype, groups=groups, )