def compute_depthwise_conv2d_NHWC_HWOI(Input, Filter, stride, padding,
                                       dilation, out_dtype=None, args={}):
    """Depthwise convolution operator in NHWC layout (HWOI filter)."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_height, in_width, channels = Input.shape
    kernel_h, kernel_w, _, _ = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    channel_block = 4
    channel_chunk = channels // channel_block
    num_filter_chunk = 1

    # compute:
    Input = te.compute(
        [batch, in_height, in_width, channel_chunk, channel_block],
        lambda nn, yy, xx, icc, icb: Input[nn, yy, xx, icc * 4 + icb],
        name="input_pack",
        tag="input_pack",
    )
    Filter = te.compute(
        [kernel_h, kernel_w, channel_chunk, num_filter_chunk, channel_block],
        lambda kh, kw, ifc, nfc, cb: Filter[kh, kw, ifc * 4 + cb, nfc],
        name="filter_pack",
        tag="filter_pack",
    )

    # Can the output shape be divided by 2 or even 4? If not, extend it to
    # help the later split. Theoretically the input should receive additional
    # padding as well, but that is optimized away by cache_read InferBound.
    # The padding here must produce exactly the tensor required to compute
    # the original output size, and no more. Otherwise the intermediate
    # tensor might be allocated with smaller sizes while the compute tries to
    # fill the expanded one - a data discrepancy as a result.
    # With textures it is not a problem to provide a texture of smaller size:
    # 1. It does not matter which values are used for the extra computations -
    #    they exist only to better fit the GPU working groups.
    # 2. When a pixel is requested out of bounds, the texture handles it
    #    correctly. As mentioned above, the value itself does not matter.
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2

    # compute graph
    pad_before = [0, pad_top, pad_left, 0, 0]
    pad_after = [0, pad_down, pad_right, 0, 0]
    # calculation of the real used input size
    # (in NHWC the height pad sits at axis 1 and the width pad at axis 2):
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    if input_latest_w < in_width + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_width + pad_before[2] + pad_after[2] - input_latest_w
    if input_latest_h < in_height + pad_before[1] + pad_after[1]:
        pad_after[1] -= in_height + pad_before[1] + pad_after[1] - input_latest_h

    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch, out_height, out_width, channel_chunk, channel_block),
        lambda nn, yy, xx, ffc, ffb: te.sum(
            (temp[nn, yy * stride_h + ry * dilation_h,
                  xx * stride_w + rx * dilation_w, ffc, ffb] *
             Filter[ry, rx, ffc, 0, ffb]).astype(args["accumulator"]),
            axis=[ry, rx],
        ),
        tag="depthwise_conv2d_nhwc",
    )
    dummy_cast = te.compute(
        (batch, out_height_orig, out_width_orig, channel_chunk, channel_block),
        lambda n, y, x, fc, fb: conv[n, y, x, fc, fb].astype(out_dtype),
        tag="dummy_cast")
    return te.compute((batch, out_height_orig, out_width_orig, channels),
                      lambda n, y, x, c: dummy_cast[n, y, x, c // 4, c % 4],
                      tag="cast_from_acc" + args["accumulator"][-2:])
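# A small standalone sketch (plain Python, illustrative only; the helper name
# is hypothetical) of the output expansion and pad trimming used above: round
# the output up to a multiple of 4 for the later split, then shrink the
# trailing pad so the padded input exactly covers the *original* output
# extent.
def _example_expand_and_trim(in_size=14, kernel=3, stride=2, pads=(1, 1),
                             dilation=1):
    dilated_kernel = (kernel - 1) * dilation + 1
    out_orig = (in_size - dilated_kernel + pads[0] + pads[1]) // stride + 1
    out = out_orig
    if out % 2 != 0:
        out += 1
    if out % 4 != 0:
        out += 2
    # last padded element actually read when producing out_orig outputs
    latest = (out_orig - 1) * stride + dilated_kernel
    pad_after = pads[1]
    if latest < in_size + pads[0] + pad_after:
        pad_after -= in_size + pads[0] + pad_after - latest
    # with the defaults: out_orig == 7 is expanded to out == 8, while
    # pad_after shrinks from 1 to 0 because the last real output only reads
    # up to padded index 14
    return out_orig, out, pad_after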
def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation,
                               out_dtype=None, args={}):
    """Convolution operator in NCHWc layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape
    num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # Can the output shape be divided by 2 or even 4? If not, extend it to
    # help the later split. Theoretically the input should receive additional
    # padding as well, but that is optimized away by cache_read InferBound.
    # The padding here must produce exactly the tensor required to compute
    # the original output size, and no more. Otherwise the intermediate
    # tensor might be allocated with smaller sizes while the compute tries to
    # fill the expanded one - a data discrepancy as a result.
    # With textures it is not a problem to provide a texture of smaller size:
    # 1. It does not matter which values are used for the extra computations -
    #    they exist only to better fit the GPU working groups.
    # 2. When a pixel is requested out of bounds, the texture handles it
    #    correctly. As mentioned above, the value itself does not matter.
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2

    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    # calculation of the real used input size:
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    if input_latest_w < in_width + pad_before[3] + pad_after[3]:
        pad_after[3] -= in_width + pad_before[3] + pad_after[3] - input_latest_w
    if input_latest_h < in_height + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_height + pad_before[2] + pad_after[2] - input_latest_h

    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rcc")
    rcb = te.reduce_axis((0, in_channel_block), name="rcb")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    # When tuning, insert a cache_read("texture") stage to properly test
    # performance of kernels that utilize texture inputs. The cache_read
    # is not needed when using the graph_runtime, which supports passing
    # in external texture buffers. This can be removed once AutoTVM tuning
    # supports capturing this runtime information during task extraction
    # or once texture lowering in tir.TextureFlatten supports cache_read
    # cancellation when padding is utilized.
    if autotvm.GLOBAL_SCOPE.in_tuning:
        # NCHWc x KCRSk
        # texture: NCH|W|c
        # texture: K|CRS|k
        Filter_tx = te.compute(
            (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block),
            lambda ffc, crs, ffb: Filter[
                ffc, crs // (kernel_h * kernel_w),
                (crs // kernel_w) % kernel_h, crs % kernel_w, ffb],
            name="packed_filter")
        conv = te.compute(
            (batch, num_filter_chunk, out_height, out_width, num_filter_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, rcc, yy * stride_h + ry * dilation_h,
                      xx * stride_w + rx * dilation_w, rcb] *
                 Filter_tx[ffc,
                           ((rcc * in_channel_block + rcb) * kernel_h + ry)
                           * kernel_w + rx, ffb]).astype(args["accumulator"]),
                axis=[rcc, rcb, ry, rx],
            ),
            tag="conv2d_nchwc",
        )
    else:
        conv = te.compute(
            (batch, num_filter_chunk, out_height, out_width, num_filter_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, rcc, yy * stride_h + ry * dilation_h,
                      xx * stride_w + rx * dilation_w, rcb] *
                 Filter[ffc, rcc * in_channel_block + rcb, ry, rx, ffb]
                 ).astype(args["accumulator"]),
                axis=[rcc, rcb, ry, rx],
            ),
            tag="conv2d_nchwc",
        )

    return te.compute(
        (batch, num_filter_chunk, out_height_orig, out_width_orig,
         num_filter_block),
        lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype(out_dtype),
        tag="cast_from_acc" + args["accumulator"][-2:])
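# A quick self-check (plain Python, illustrative only; the helper name is
# hypothetical) that the flattened CRS index used to read Filter_tx above
# inverts the unflattening performed in packed_filter:
# crs = (c * KH + r) * KW + s decomposes back to (c, r, s).
def _check_crs_roundtrip(channels=8, kernel_h=3, kernel_w=3):
    for c in range(channels):
        for r in range(kernel_h):
            for s in range(kernel_w):
                crs = (c * kernel_h + r) * kernel_w + s
                assert crs // (kernel_h * kernel_w) == c
                assert (crs // kernel_w) % kernel_h == r
                assert crs % kernel_w == s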
def compute_depthwise_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding,
                                         dilation, out_dtype=None, args={}):
    """Depthwise convolution operator in NCHWc layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, channel_chunk, in_height, in_width, channel_block = Input.shape
    _, channel_multiplier, kernel_h, kernel_w, _ = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_channel_chunk = simplify(channel_chunk * channel_multiplier)
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    if autotvm.GLOBAL_SCOPE.in_tuning:
        # NCHWc x CMRSc = [N, (C//4)M, OH, OW, 4c]
        # NCHWc x CMRS
        # texture: NCH|W|c
        # texture: C|MRS|c
        Filter_tx = te.compute(
            (channel_chunk, channel_multiplier * kernel_h * kernel_w,
             channel_block),
            lambda ffc, mrs, ffb: Filter[
                ffc, mrs // (kernel_h * kernel_w),
                (mrs // kernel_w) % kernel_h, mrs % kernel_w, ffb],
            name="packed_filter")
        conv = te.compute(
            (batch, out_channel_chunk, out_height, out_width, channel_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, ffc // channel_multiplier,
                      yy * stride_h + ry * dilation_h,
                      xx * stride_w + rx * dilation_w, ffb] *
                 Filter_tx[ffc // channel_multiplier,
                           ((ffc % channel_multiplier) * kernel_h + ry)
                           * kernel_w + rx, ffb]).astype(args["accumulator"]),
                axis=[ry, rx],
            ),
            tag="depthwise_conv2d_nchwc_kcrsk_texture",
        )
    else:
        conv = te.compute(
            (batch, out_channel_chunk, out_height, out_width, channel_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, ffc // channel_multiplier,
                      yy * stride_h + ry * dilation_h,
                      xx * stride_w + rx * dilation_w, ffb] *
                 Filter[ffc // channel_multiplier, ffc % channel_multiplier,
                        ry, rx, ffb]).astype(args["accumulator"]),
                axis=[ry, rx],
            ),
            tag="depthwise_conv2d_nchwc_kcrsk",
        )
    return te.compute(
        conv.shape,
        lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype),
        tag="cast_from_acc" + args["accumulator"][-2:])
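# An illustrative check (plain Python; the helper name is hypothetical) of
# the depthwise index mapping above: output chunk ffc reads input chunk
# ffc // channel_multiplier with filter multiplier ffc % channel_multiplier,
# so the out_channel_chunk = channel_chunk * channel_multiplier output
# chunks cover each (input chunk, multiplier) pair exactly once.
def _check_depthwise_mapping(channel_chunk=4, channel_multiplier=2):
    pairs = set()
    for ffc in range(channel_chunk * channel_multiplier):
        pairs.add((ffc // channel_multiplier, ffc % channel_multiplier))
    assert len(pairs) == channel_chunk * channel_multiplier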
def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation,
                               out_dtype):
    """Compute function for Cortex-M7 SIMD implementation of conv2d."""
    assert isinstance(strides, int) or len(strides) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(strides, int):
        stride_h = stride_w = strides
    else:
        stride_h, stride_w = strides

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch_size, in_height, in_width, in_channels = data.shape
    kernel_h, kernel_w, out_channels, _ = kernel.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_down, pad_right, 0]
    padded_data = pad(data, pad_before, pad_after, name="padded_data")

    rc = te.reduce_axis((0, in_channels), name="rc")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch_size, out_height, out_width, out_channels),
        lambda nn, yy, xx, ff: te.sum(
            padded_data[nn, yy * stride_h + ry * dilation_h,
                        xx * stride_w + rx * dilation_w, rc].astype(out_dtype)
            * kernel[ry, rx, ff, rc].astype(out_dtype),
            axis=[ry, rx, rc],
        ),
        name="conv2d",
        tag="conv2d_nhwc",
    )

    ###########################
    # Config Space Definition #
    ###########################
    n, oh, ow, co = (
        cfg.axis(batch_size.value),
        cfg.axis(out_height.value),
        cfg.axis(out_width.value),
        cfg.axis(out_channels.value),
    )
    kh, kw, ci = (
        cfg.reduce_axis(kernel_h.value),
        cfg.reduce_axis(kernel_w.value),
        cfg.reduce_axis(in_channels.value),
    )

    owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2)
    cio, cii = cfg.define_split(
        "tile_ci",
        ci,
        policy="factors",
        num_outputs=2,
        # TODO: check case with in_channels.value % 4 != 0 with AutoTVM
        filter=None if cfg.is_fallback else lambda x: x.size[-1] % 4 == 0,
    )
    coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2)

    cfg.define_reorder(
        "reorder_0_simd",
        [n, oh, owo, owi, coo, coi, kh, kw, cio, cii],
        policy="candidate",
        candidate=[
            [n, oh, kh, kw, owo, coo, cio, owi, coi, cii],
            [n, oh, kh, kw, coo, owo, cio, owi, coi, cii],
            [n, kh, kw, oh, owo, coo, cio, owi, coi, cii],
            [n, kh, kw, oh, coo, owo, cio, owi, coi, cii],
        ],
    )

    cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32])
    cfg.define_knob("unroll_explicit", [0, 1])

    if cfg.is_fallback:
        cfg.fallback_split("tile_ow", [-1, out_width.value])
        cfg.fallback_split("tile_ci", [-1, in_channels.value])
        cfg.fallback_split("tile_co", [-1, out_channels.value])

    return conv
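# A minimal sketch (assumed shapes and dtypes; the helper name is
# hypothetical) of instantiating the compute above outside a tuning job.
# autotvm.get_config() yields a fallback config when no tuning context is
# active, which exercises the fallback_split branch at the end of the
# function.
def _example_direct_simd():
    data = te.placeholder((1, 16, 16, 8), name="data", dtype="int8")
    kernel = te.placeholder((3, 3, 16, 8), name="kernel", dtype="int8")
    cfg = autotvm.get_config()
    return conv2d_direct_simd_compute(cfg, data, kernel, strides=1,
                                      padding=1, dilation=1,
                                      out_dtype="int32")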
def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation,
                             out_dtype=None, args={}):
    """Convolution operator in NHWC layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_height, in_width, in_channel = Input.shape
    kernel_h, kernel_w, _, out_channels = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    in_channel_block = 4
    in_channel_tail = in_channel % in_channel_block
    in_channel_chunk = in_channel // in_channel_block
    num_filter_block = 4
    num_filter_tail = out_channels % num_filter_block
    num_filter_chunk = out_channels // num_filter_block

    pad_value = tvm.tir.const(0, Input.dtype)

    # compute:
    if in_channel_tail == 0:
        Input = te.compute(
            [batch, in_height, in_width, in_channel_chunk, in_channel_block],
            lambda nn, yy, xx, icc, icb: Input[
                nn, yy, xx, icc * in_channel_block + icb],
            name="input_pack",
            tag="input_pack",
        )
    else:
        in_channel_chunk += 1

        def _reorder_data(*indices):
            # zero-fill the lanes past the channel tail in the last chunk
            condition = []
            condition.append(indices[3] == in_channel_chunk - 1)
            condition.append(indices[4] >= in_channel_tail)
            condition = tvm.tir.all(*condition)
            return tvm.tir.if_then_else(
                condition, pad_value,
                Input[indices[0], indices[1], indices[2],
                      indices[3] * in_channel_block + indices[4]])

        Input = te.compute(
            [batch, in_height, in_width, in_channel_chunk, in_channel_block],
            _reorder_data,
            name="input_pack",
            tag="input_pack_expanded",
        )

    if num_filter_tail == 0 and in_channel_tail == 0:
        Filter = te.compute(
            [kernel_h, kernel_w, in_channel, num_filter_chunk,
             num_filter_block],
            lambda kh, kw, ic, nfc, nfb: Filter[
                kh, kw, ic, nfc * num_filter_block + nfb],
            name="filter_pack",
            tag="filter_pack",
        )
    else:
        num_filter_chunk += 1

        # HWIO
        def _reorder_weights(*indices):
            # zero-fill the lanes past the filter tail in the last chunk and
            # the input channels past the original extent
            conditionA = []
            conditionA.append(indices[3] == num_filter_chunk - 1)
            conditionA.append(indices[4] >= num_filter_tail)
            conditionAT = tvm.tir.all(*conditionA)

            conditionO = []
            conditionO.append(conditionAT)
            conditionO.append(indices[2] >= in_channel)
            conditionOT = tvm.tir.any(*conditionO)
            return tvm.tir.if_then_else(
                conditionOT, pad_value,
                Filter[indices[0], indices[1], indices[2],
                       indices[3] * num_filter_block + indices[4]])

        Filter = te.compute(
            [kernel_h, kernel_w, in_channel_chunk * in_channel_block,
             num_filter_chunk, num_filter_block],
            _reorder_weights,
            name="filter_pack",
            tag="filter_pack_expanded",
        )

    # Can the output shape be divided by 2 or even 4? If not, extend it to
    # help the later split. Theoretically the input should receive additional
    # padding as well, but that is optimized away by cache_read InferBound.
    # The padding here must produce exactly the tensor required to compute
    # the original output size, and no more. Otherwise the intermediate
    # tensor might be allocated with smaller sizes while the compute tries to
    # fill the expanded one - a data discrepancy as a result.
    # With textures it is not a problem to provide a texture of smaller size:
    # 1. It does not matter which values are used for the extra computations -
    #    they exist only to better fit the GPU working groups.
    # 2. When a pixel is requested out of bounds, the texture handles it
    #    correctly. As mentioned above, the value itself does not matter.
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2

    # compute graph
    pad_before = [0, pad_top, pad_left, 0, 0]
    pad_after = [0, pad_down, pad_right, 0, 0]
    # calculation of the real used input size
    # (in NHWC the height pad sits at axis 1 and the width pad at axis 2):
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    if input_latest_w < in_width + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_width + pad_before[2] + pad_after[2] - input_latest_w
    if input_latest_h < in_height + pad_before[1] + pad_after[1]:
        pad_after[1] -= in_height + pad_before[1] + pad_after[1] - input_latest_h

    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rcc")
    rcb = te.reduce_axis((0, in_channel_block), name="rcb")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch, out_height, out_width, num_filter_chunk, num_filter_block),
        lambda nn, yy, xx, fc, fb: te.sum(
            (temp[nn, yy * stride_h + ry * dilation_h,
                  xx * stride_w + rx * dilation_w, rcc, rcb] *
             Filter[ry, rx, rcc * in_channel_block + rcb, fc, fb]
             ).astype(args["accumulator"]),
            axis=[ry, rx, rcc, rcb],
        ),
        tag="conv2d_nhwc",
    )
    dummy_cast = te.compute(
        (batch, out_height_orig, out_width_orig, num_filter_chunk,
         num_filter_block),
        lambda n, y, x, fc, fb: conv[n, y, x, fc, fb].astype(out_dtype),
        tag="dummy_cast")
    return te.compute((batch, out_height_orig, out_width_orig, out_channels),
                      lambda n, y, x, c: dummy_cast[n, y, x, c // 4, c % 4],
                      tag="cast_from_acc" + args["accumulator"][-2:])
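# An illustrative sketch (plain Python; the helper name is hypothetical) of
# the channel-tail handling in _reorder_data above: with in_channel = 6 and
# a block of 4, the data is packed into 2 chunks of 4 lanes, and the last
# two lanes of the last chunk are filled with pad_value (zero).
def _example_channel_tail(in_channel=6, block=4):
    tail = in_channel % block
    chunk = in_channel // block + (1 if tail else 0)
    lanes = []
    for cc in range(chunk):
        for cb in range(block):
            is_pad = cc == chunk - 1 and tail != 0 and cb >= tail
            lanes.append("pad" if is_pad else cc * block + cb)
    return lanes  # [0, 1, 2, 3, 4, 5, 'pad', 'pad']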
def conv1d_nwc_dsp_compute(cfg, data, kernel, strides, padding, dilation,
                           out_dtype):
    """Compute function for v7e-m DSP instructions of conv1d in NWC layout."""
    if isinstance(strides, (tuple, list)):
        strides = strides[0]
    if isinstance(dilation, (tuple, list)):
        dilation = dilation[0]

    batch_size, data_width, in_channels = data.shape
    kernel_size, out_channels, _ = kernel.shape

    # Compute the output shape
    dilated_kernel_size = (kernel_size - 1) * dilation + 1
    pad_left, pad_right = get_pad_tuple1d(padding, (dilated_kernel_size,))
    out_channels = simplify(out_channels)
    out_width = simplify(
        (data_width - dilated_kernel_size + pad_left + pad_right) // strides + 1)

    # Apply padding
    pad_before = [0, pad_left, 0]
    pad_after = [0, pad_right, 0]
    padded_data = pad(data, pad_before, pad_after, name="padded_data")

    # Compute graph
    rc = te.reduce_axis((0, in_channels), name="rc")
    rw = te.reduce_axis((0, kernel_size), name="rw")

    conv = te.compute(
        (batch_size, out_width, out_channels),
        lambda b, w, c: te.sum(
            padded_data[b, w * strides + rw * dilation, rc].astype(out_dtype)
            * kernel[rw, c, rc].astype(out_dtype),
            axis=[rw, rc],
        ),
        name="conv1d",
        tag="conv1d_nwc",
    )

    ###########################
    # Config Space Definition #
    ###########################
    n, ow, co = (
        cfg.axis(batch_size.value),
        cfg.axis(out_width.value),
        cfg.axis(out_channels.value),
    )
    kw, ci = (
        cfg.reduce_axis(kernel_size.value),
        cfg.reduce_axis(in_channels.value),
    )

    owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2)
    cio, cii = cfg.define_split(
        "tile_ci",
        ci,
        policy="factors",
        num_outputs=2,
        # TODO: check case with in_channels.value % 4 != 0 with AutoTVM
        filter=None if cfg.is_fallback else lambda x: x.size[-1] % 4 == 0,
    )
    coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2)

    cfg.define_reorder(
        "reorder_0_simd",
        [n, owo, owi, coo, coi, kw, cio, cii],
        policy="candidate",
        candidate=[
            [n, kw, owo, coo, cio, owi, coi, cii],
            [n, kw, coo, owo, cio, owi, coi, cii],
            [n, kw, owo, coo, cio, owi, coi, cii],
            [n, kw, coo, owo, cio, owi, coi, cii],
        ],
    )

    cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32])
    cfg.define_knob("unroll_explicit", [0, 1])

    if cfg.is_fallback:
        cfg.fallback_split("tile_ow", [-1, out_width.value])
        cfg.fallback_split("tile_ci", [-1, in_channels.value])
        cfg.fallback_split("tile_co", [-1, out_channels.value])

    return conv
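# A worked example (plain Python; the helper name is hypothetical) of the
# output width formula above: with data_width = 10, kernel_size = 3,
# stride = 2, dilation = 2 and pads = (2, 2), the dilated kernel spans
# (3 - 1) * 2 + 1 = 5 taps, so out_width = (10 - 5 + 2 + 2) // 2 + 1 = 5.
def _example_conv1d_out_width(data_width=10, kernel_size=3, stride=2,
                              dilation=2, pads=(2, 2)):
    dilated = (kernel_size - 1) * dilation + 1
    return (data_width - dilated + pads[0] + pads[1]) // stride + 1  # == 5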
def dilation2d_nchw(input, filter, stride, padding, dilations, out_dtype=None):
    """Morphological dilation operator in NCHW layout.

    Parameters
    ----------
    input : tvm.te.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    filter : tvm.te.Tensor
        3-D with shape [in_channel, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size

    dilations : int or a list/tuple of two ints
        Dilation size, or [dilation_height, dilation_width]

    out_dtype : Optional[str]
        Specifies the output data type.

    Returns
    -------
    Output : tvm.te.Tensor
        4-D with shape [batch, in_channel, out_height, out_width]
    """
    if out_dtype is None:
        out_dtype = input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilations, int) or len(dilations) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilations, int):
        dilation_h = dilation_w = dilations
    else:
        dilation_h, dilation_w = dilations

    batch, in_channel, in_height, in_width = input.shape
    channel, kernel_h, kernel_w = filter.shape
    assert (in_channel.value == channel.value
            ), "For Dilation2D, input and filter channels should be the same."

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # compute graph
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    temp = pad(input, pad_before, pad_after, name="pad_temp")

    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    return te.compute(
        (batch, in_channel, out_height, out_width),
        lambda nn, ff, yy, xx: te.max(
            temp[nn, ff, yy * stride_h + ry * dilation_h,
                 xx * stride_w + rx * dilation_w].astype(out_dtype)
            + filter[ff, ry, rx].astype(out_dtype),
            axis=[ry, rx],
        ),
        tag="dilation2d_nchw",
    )
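# A tiny NumPy reference (illustrative only; assumes stride=1, dilation=1,
# no padding, float inputs) of the max-plus semantics computed above: each
# output element is the maximum over the window of input + filter.
def _dilation2d_nchw_reference(x, f):
    import numpy as np
    n, c, h, w = x.shape
    _, kh, kw = f.shape
    oh, ow = h - kh + 1, w - kw + 1
    out = None
    for r in range(kh):
        for s in range(kw):
            cand = x[:, :, r:r + oh, s:s + ow] + f[:, r, s].reshape(1, -1, 1, 1)
            out = cand if out is None else np.maximum(out, cand)
    return out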
def expand_spatial_dimensions(in_height, in_width, kernel_h, kernel_w,
                              dilation_h, dilation_w, padding, stride_h,
                              stride_w):
    """
    Expands spatial dimensions to be divisible by a factor of 4. This allows
    considerably better parallel computation on the GPU. The drawback of this
    solution is a number of useless computations. In practice the speed-up
    from the added parallelism significantly outweighs the slowdown of the
    extra compute, so this is a useful approach, at least for GPUs.

    Parameters
    ----------
    in_height: int
        Height of the feature map

    in_width: int
        Width of the feature map

    kernel_h: int
        Height of the conv2d kernel

    kernel_w: int
        Width of the conv2d kernel

    dilation_h: int
        Vertical dilation of the conv2d kernel

    dilation_w: int
        Horizontal dilation of the conv2d kernel

    padding: tuple or list
        Conv2d paddings

    stride_h: int
        Vertical stride of the conv2d kernel

    stride_w: int
        Horizontal stride of the conv2d kernel

    Returns
    -------
    out_height_orig, out_height, out_width_orig, out_width: int
        The original and the expanded output height and width.
    """
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # Can the output shape be divided by 2 or even 4? If not, extend it to
    # help the later split. Theoretically the input should receive additional
    # padding as well, but that is optimized away by cache_read InferBound.
    # The padding here must produce exactly the tensor required to compute
    # the original output size, and no more. Otherwise the intermediate
    # tensor might be allocated with smaller sizes while the compute tries to
    # fill the expanded one - a data discrepancy as a result.
    # With textures it is not a problem to provide a texture of smaller size:
    # 1. It does not matter which values are used for the extra computations -
    #    they exist only to better fit the GPU working groups.
    # 2. When a pixel is requested out of bounds, the texture handles it
    #    correctly. As mentioned above, the value itself does not matter.
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2
    return out_height_orig, out_height, out_width_orig, out_width
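# An illustrative replica (plain Python; the helper name is hypothetical) of
# the rounding above: 7 -> 8, 10 -> 12, 8 -> 8. After the first step the
# extent is even, so the second step can only add 2, and every result is a
# multiple of 4.
def _example_round_up4(extent):
    if extent % 2 != 0:
        extent += 1
    if extent % 4 != 0:
        extent += 2
    return extent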
def compute_conv2d_NCHWc_tpack(Input, Filter, stride, padding, dilation,
                               out_dtype=None, args={}):
    """Convolution operator in NCHWc layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channels, in_height, in_width = Input.shape
    out_channels, _, kernel_h, kernel_w = Filter.shape

    in_channel_tail = in_channels % 4
    in_channel_chunk = in_channels // 4
    if in_channel_tail == 0:
        in_channel_tail = 4
    else:
        in_channel_chunk += 1

    num_filter_block = out_channels % 4
    num_filter_chunk = out_channels // 4
    if num_filter_block == 0:
        num_filter_block = 4
    else:
        num_filter_chunk += 1

    pad_value = tvm.tir.const(0, Input.dtype)

    def _reorder_data(*indices):
        # zero-fill the lanes past the channel tail in the last chunk
        condition = []
        condition.append(indices[1] == in_channel_chunk - 1)
        condition.append(indices[4] >= in_channel_tail)
        condition = tvm.tir.all(*condition)
        return tvm.tir.if_then_else(
            condition, pad_value,
            Input[indices[0], indices[1] * 4 + indices[4], indices[2],
                  indices[3]])

    # compute:
    reordered_data = te.compute(
        [batch, in_channel_chunk, in_height, in_width, 4],
        _reorder_data,
        name="input_pack",
        tag="input_pack",
    )

    def _reorder_weights(*indices):
        # zero-fill the lanes past the filter tail in the last chunk and the
        # input channels past the original extent
        conditionA = []
        conditionA.append(indices[0] == num_filter_chunk - 1)
        conditionA.append(indices[4] >= num_filter_block)
        conditionAT = tvm.tir.all(*conditionA)

        conditionO = []
        conditionO.append(conditionAT)
        conditionO.append(indices[1] >= in_channels)
        conditionOT = tvm.tir.any(*conditionO)
        return tvm.tir.if_then_else(
            conditionOT, pad_value,
            Filter[indices[0] * 4 + indices[4], indices[1], indices[2],
                   indices[3]])

    reordered_filter = te.compute(
        [num_filter_chunk, in_channel_chunk * 4, kernel_h, kernel_w, 4],
        _reorder_weights,
        name="filter_pack",
        tag="filter_pack",
    )

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # Can the output shape be divided by 2 or even 4? If not, extend it to
    # help the later split. Theoretically the input should receive additional
    # padding as well, but that is optimized away by cache_read InferBound.
    # The padding here must produce exactly the tensor required to compute
    # the original output size, and no more. Otherwise the intermediate
    # tensor might be allocated with smaller sizes while the compute tries to
    # fill the expanded one - a data discrepancy as a result.
    # With textures it is not a problem to provide a texture of smaller size:
    # 1. It does not matter which values are used for the extra computations -
    #    they exist only to better fit the GPU working groups.
    # 2. When a pixel is requested out of bounds, the texture handles it
    #    correctly. As mentioned above, the value itself does not matter.
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2

    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    # calculation of the real used input size:
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    if input_latest_w < in_width + pad_before[3] + pad_after[3]:
        pad_after[3] -= in_width + pad_before[3] + pad_after[3] - input_latest_w
    if input_latest_h < in_height + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_height + pad_before[2] + pad_after[2] - input_latest_h

    temp = nn.pad(reordered_data, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rcc")
    rcb = te.reduce_axis((0, 4), name="rcb")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch, num_filter_chunk, out_height, out_width, 4),
        lambda nn, ffc, yy, xx, ffb: te.sum(
            (temp[nn, rcc, yy * stride_h + ry * dilation_h,
                  xx * stride_w + rx * dilation_w, rcb] *
             reordered_filter[ffc, rcc * 4 + rcb, ry, rx, ffb]
             ).astype(args["accumulator"]),
            axis=[rcc, rcb, ry, rx],
        ),
        tag="conv2d_nchwc_tpack",
    )

    # conv = s.cache_write(conv, "local") does not work properly: it does not
    # create the intermediate buffer, keeps reading/writing the global tensor
    # as the accumulator, and leads to a crash at runtime. For this reason we
    # have to use this dummy cast plus compute_at to create the intermediate
    # accumulator with local scope.
    dummy_cast = te.compute(
        (batch, num_filter_chunk, out_height_orig, out_width_orig, 4),
        lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype(out_dtype),
        tag="dummy_cast")
    return te.compute((batch, out_channels, out_height_orig, out_width_orig),
                      lambda n, c, y, x: dummy_cast[n, c // 4, y, x, c % 4],
                      tag="cast_from_acc" + args["accumulator"][-2:])
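# A minimal sketch (hypothetical schedule, illustrative only) of the
# dummy_cast workaround described above: give the conv stage local scope and
# compute it at the cast stage, producing the local accumulator that
# s.cache_write could not create.
def _example_tpack_schedule(output):
    s = te.create_schedule(output.op)
    dummy = output.op.input_tensors[0]
    conv = dummy.op.input_tensors[0]
    s[conv].set_scope("local")
    s[conv].compute_at(s[dummy], dummy.op.axis[3])
    return s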
def compute_conv2d_NCHWc_KCRSk_acc32(Input, Filter, stride, padding, dilation,
                                     out_dtype=None):
    """Convolution operator in NCHWc layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape
    num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rcc")
    rcb = te.reduce_axis((0, in_channel_block), name="rcb")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    # NCHWc x KCRSk
    # texture: NCH|W|c
    # texture: K|CRS|k
    # c = crs // (R*S)
    # rs = crs % (R*S)
    # r = rs // S == (crs // S) % R
    # s = rs % S == crs % S
    Filter = te.compute(
        (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block),
        lambda ffc, crs, ffb: Filter[
            ffc, crs // (kernel_h * kernel_w),
            (crs // kernel_w) % kernel_h, crs % kernel_w, ffb],
        name="packed_filter",
    )
    conv = te.compute(
        (batch, num_filter_chunk, out_height, out_width, num_filter_block),
        lambda nn, ffc, yy, xx, ffb: te.sum(
            (temp[nn, rcc, yy * stride_h + ry * dilation_h,
                  xx * stride_w + rx * dilation_w, rcb] *
             Filter[ffc,
                    ((rcc * in_channel_block + rcb) * kernel_h + ry)
                    * kernel_w + rx, ffb]).astype(out_dtype),
            axis=[rcc, rcb, ry, rx],
        ),
        tag="conv2d_nchwc_kcrsk_texture",
    )
    output = te.compute(
        conv.shape,
        lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype("float32"))
    return output
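# A minimal sketch (assumed shapes/dtypes and a default schedule; the helper
# name is hypothetical) of wiring the compute above into tvm.lower. A real
# GPU build would need a proper schedule and target; this only checks that
# the compute graph lowers.
def _example_lower_acc32():
    data = te.placeholder((1, 8, 14, 14, 4), name="data", dtype="float16")
    filt = te.placeholder((8, 32, 3, 3, 4), name="filt", dtype="float16")
    out = compute_conv2d_NCHWc_KCRSk_acc32(data, filt, 1, 1, 1, "float32")
    s = te.create_schedule(out.op)
    return tvm.lower(s, [data, filt, out])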