def Gemm_tv2_reorder2_3_vec1_para1_unrollv1_packv1_writecachev1(C):
    """Schedule a packed GEMM that accumulates through a write cache.

    The incoming ``C`` is never read.  The computation is declared again from
    ``A``, ``B``, ``k``, ``M``, ``N`` and ``K`` of the enclosing scope, and the
    schedule is built for that new tensor.

    Parameters
    ----------
    C : tvm.Tensor
        Ignored; kept so the signature matches the sibling schedule builders.

    Returns
    -------
    schedule
        Schedule in which the output is tiled 32x32 and parallelised on the
        outer row axis, the write cache is split/reordered/vectorised/unrolled,
        and the packing stage is vectorised, parallelised and unrolled.
    """
    tile = 32

    # B re-laid out as [N / tile, K, tile].
    packed = tvm.compute(
        (N / tile, K, tile),
        lambda x, y, z: B[y, x * tile + z],
        name='packedB')
    out = tvm.compute(
        (M, N),
        lambda x, y: tvm.sum(
            A[x, k] * packed[y // tile, k, tvm.indexmod(y, tile)],
            axis=k),
        name='C')

    sched = tvm.create_schedule(out.op)

    # Write cache in 'global' scope, attached under the outer column axis.
    cache = sched.cache_write(out, 'global')
    out_stage = sched[out]
    cache_stage = sched[cache]
    row_o, col_o, row_i, col_i = out_stage.tile(
        out.op.axis[0], out.op.axis[1], tile, tile)
    cache_stage.compute_at(out_stage, col_o)

    # Loop nest of the cache stage: split the reduction, then reorder.
    red_o, red_i = cache_stage.split(
        cache_stage.op.reduce_axis[0], factor=32, nparts=None)
    cache_row, cache_col = cache_stage.op.axis
    cache_stage.reorder(red_o, cache_row, red_i, cache_col)
    cache_stage.vectorize(cache_col)
    # NOTE: s[CC].parallel(ko) could not be added here (reason not recorded).
    cache_stage.unroll(red_i)
    out_stage.parallel(row_o)

    # Packing stage.
    pack_stage = sched[packed]
    pack_outer, pack_mid, pack_inner = pack_stage.op.axis
    pack_stage.vectorize(pack_inner)
    pack_stage.parallel(pack_outer)
    pack_stage.unroll(pack_mid)
    return sched
def Gemm_v4(C):
    """Schedule GEMM with tiling, reordering, vectorization and array packing.

    Only the reduction axis of the incoming ``C`` is reused.  The computation
    itself is declared again on top of a packed copy of ``B`` (``A``, ``B``,
    ``M``, ``N``, ``K`` come from the enclosing scope).

    Parameters
    ----------
    C : tvm.Tensor
        Tensor whose operation supplies the single reduction axis.

    Returns
    -------
    schedule
        Schedule for the re-declared output tensor.
    """
    # Exactly one reduction axis is expected on the incoming operation.
    (red,) = tvm.create_schedule(C.op)[C].op.reduce_axis
    tile = 32

    packed = tvm.compute(
        (N / tile, K, tile),
        lambda x, y, z: B[y, x * tile + z],
        name='packedB')
    out = tvm.compute(
        (M, N),
        lambda x, y: tvm.sum(
            A[x, red] * packed[y // tile, red, tvm.indexmod(y, tile)],
            axis=red),
        name='C')

    sched = tvm.create_schedule(out.op)
    out_stage = sched[out]

    # Blocking of the two output axes plus a split of the reduction.
    row_o, col_o, row_i, col_i = out_stage.tile(
        out.op.axis[0], out.op.axis[1], tile, tile)
    red_o, red_i = out_stage.split(red, factor=4)
    out_stage.reorder(row_o, col_o, red_o, row_i, red_i, col_i)
    out_stage.vectorize(col_i)

    # Packing stage: vectorise the innermost axis, parallelise the outermost.
    pack_stage = sched[packed]
    pack_outer, _pack_mid, pack_inner = pack_stage.op.axis
    pack_stage.vectorize(pack_inner)
    pack_stage.parallel(pack_outer)

    print("Tiling、reordering、Vectorization、Loop Permutation、Array Packing")
    return sched
def Gemm_tv2_reorder2_3_vec1_para1_unrollv1_packv1(C):
    """Schedule a packed GEMM with tile/split/reorder/vectorize/parallel/unroll.

    The incoming ``C`` is never read.  The computation is declared again from
    ``A``, ``B``, ``k``, ``M``, ``N`` and ``K`` of the enclosing scope.

    Parameters
    ----------
    C : tvm.Tensor
        Ignored; kept so the signature matches the sibling schedule builders.

    Returns
    -------
    schedule
        Schedule for the re-declared output tensor.
    """
    tile = 32

    packed = tvm.compute(
        (N / tile, K, tile),
        lambda x, y, z: B[y, x * tile + z],
        name='packedB')
    out = tvm.compute(
        (M, N),
        lambda x, y: tvm.sum(
            A[x, k] * packed[y // tile, k, tvm.indexmod(y, tile)],
            axis=k),
        name='C')

    sched = tvm.create_schedule(out.op)
    out_stage = sched[out]

    # tile() only accepts a two-dimensional tiling ...
    row_o, col_o, row_i, col_i = out_stage.tile(
        out.op.axis[0], out.op.axis[1], tile, tile)
    # ... so the third dimension (K) is partitioned with a separate split.
    # Note: the tile size and `factor` both give the extent of the inner axis.
    red_o, red_i = out_stage.split(
        out_stage.op.reduce_axis[0], factor=32, nparts=None)

    # Trying out whether changing the order of the outer axes matters.
    out_stage.reorder(row_o, col_o, red_o, row_i, red_i, col_i)
    # Experiments show (for both feasibility and performance) that
    # vectorization should target the innermost loop.
    out_stage.vectorize(col_i)
    # Experiments show parallel should be applied to the outermost loop axis.
    out_stage.parallel(row_o)
    # Experiments show unroll works well on the second-innermost loop.
    out_stage.unroll(red_i)

    pack_stage = sched[packed]
    pack_outer, pack_mid, pack_inner = pack_stage.op.axis
    pack_stage.vectorize(pack_inner)
    pack_stage.parallel(pack_outer)
    pack_stage.unroll(pack_mid)
    return sched
def Gemm_v6(C):
    """Schedule GEMM with packing, a write cache and a parallel outer loop.

    Covers tiling, reordering, vectorization, loop permutation, array
    packing, write cache for blocks and parallelization.  Only the reduction
    axis of the incoming ``C`` is reused; the computation is declared again on
    a packed copy of ``B`` (``A``, ``B``, ``M``, ``N``, ``K`` come from the
    enclosing scope).

    Parameters
    ----------
    C : tvm.Tensor
        Tensor whose operation supplies the single reduction axis.

    Returns
    -------
    schedule
        Schedule for the re-declared output tensor.
    """
    tile = 32
    # Exactly one reduction axis is expected on the incoming operation.
    (red,) = tvm.create_schedule(C.op)[C].op.reduce_axis

    packed = tvm.compute(
        (N / tile, K, tile),
        lambda x, y, z: B[y, x * tile + z],
        name='packedB')
    out = tvm.compute(
        (M, N),
        lambda x, y: tvm.sum(
            A[x, red] * packed[y // tile, red, tvm.indexmod(y, tile)],
            axis=red),
        name='C')

    sched = tvm.create_schedule(out.op)

    # Furthermore, we can also utilize multi-core processors to do the
    # thread-level parallelization.
    cache = sched.cache_write(out, 'global')
    out_stage = sched[out]
    cache_stage = sched[cache]

    row_o, col_o, row_i, col_i = out_stage.tile(
        out.op.axis[0], out.op.axis[1], tile, tile)
    cache_stage.compute_at(out_stage, col_o)

    # Axes of the cache stage.
    cache_row, cache_col = cache_stage.op.axis
    (cache_red,) = cache_stage.op.reduce_axis
    red_o, red_i = cache_stage.split(cache_red, factor=4)
    cache_stage.reorder(red_o, cache_row, red_i, cache_col)
    cache_stage.unroll(red_i)
    cache_stage.vectorize(cache_col)

    # Parallelise the outermost row axis of the output stage.
    out_stage.parallel(row_o)

    pack_stage = sched[packed]
    pack_outer, _pack_mid, pack_inner = pack_stage.op.axis
    pack_stage.vectorize(pack_inner)
    pack_stage.parallel(pack_outer)

    print(
        "Tiling、reordering、Vectorization、Loop Permutation、Array Packing、Write cache for blocks、Parallel"
    )
    return sched
def Gemm_v5(C):
    """Schedule GEMM with array packing and a write cache for blocks.

    Covers tiling, reordering, vectorization, loop permutation, array packing
    and write cache for blocks.  Only the reduction axis of the incoming ``C``
    is reused; the computation is declared again on a packed copy of ``B``
    (``A``, ``B``, ``M``, ``N``, ``K`` come from the enclosing scope).

    Parameters
    ----------
    C : tvm.Tensor
        Tensor whose operation supplies the single reduction axis.

    Returns
    -------
    schedule
        Schedule for the re-declared output tensor.
    """
    tile = 32
    # Exactly one reduction axis is expected on the incoming operation.
    (red,) = tvm.create_schedule(C.op)[C].op.reduce_axis

    packed = tvm.compute(
        (N / tile, K, tile),
        lambda x, y, z: B[y, x * tile + z],
        name='packedB')
    out = tvm.compute(
        (M, N),
        lambda x, y: tvm.sum(
            A[x, red] * packed[y // tile, red, tvm.indexmod(y, tile)],
            axis=red),
        name='C')

    sched = tvm.create_schedule(out.op)

    # Write cache in 'global' scope, attached under the outer column axis.
    cache = sched.cache_write(out, 'global')
    out_stage = sched[out]
    cache_stage = sched[cache]
    _row_o, col_o, _row_i, _col_i = out_stage.tile(
        out.op.axis[0], out.op.axis[1], tile, tile)
    cache_stage.compute_at(out_stage, col_o)

    # Axes of the cache stage.
    cache_row, cache_col = cache_stage.op.axis
    (cache_red,) = cache_stage.op.reduce_axis
    red_o, red_i = cache_stage.split(cache_red, factor=4)
    cache_stage.reorder(red_o, cache_row, red_i, cache_col)
    cache_stage.unroll(red_i)
    cache_stage.vectorize(cache_col)

    pack_stage = sched[packed]
    pack_outer, _pack_mid, pack_inner = pack_stage.op.axis
    pack_stage.vectorize(pack_inner)
    pack_stage.parallel(pack_outer)

    print(
        "Tiling、reordering、Vectorization、Loop Permutation、Array Packing、Write cache for blocks"
    )
    return sched
def conv1d_transpose_ncw(cfg, data, kernel, stride, padding, out_dtype):
    """Declare the transposed 1D convolution (NCW layout) forward compute.

    The input is first expanded into a zero-filled tensor (``data_pad``) in
    which the original samples sit ``stride`` apart, surrounded by
    ``kernel_size - 1 - pad`` zeros on each side.  The output is then a
    reduction of that tensor against the width-flipped kernel.

    Parameters
    ----------
    cfg : ConfigEntity
        The config for this template; its ``stride`` attribute is set to the
        scalar stride that is used.

    data : tvm.Tensor
        3-D with shape [batch, in_channel, inp_width]

    kernel : tvm.Tensor
        3-D with shape [in_channel, num_filter, kernel_size]

    stride : int, or tuple/list of one int
        The spatial stride along width; for a tuple or list only the first
        element is used.

    padding : int, tuple, or string
        int: padding size
        tuple of 2 ints: (pad_left, pad_right) for left and right padding
        string: ['VALID', 'SAME']

    out_dtype : str
        The output type. This is used in mixed precision.

    Returns
    -------
    output : tvm.Tensor
        3-D with shape [batch, out_channel, out_width]
    """
    stride_w = stride[0] if isinstance(stride, (tuple, list)) else stride
    cfg.stride = stride_w

    batch, inp_channels, inp_width = get_const_tuple(data.shape)
    _, out_channels, kernel_size = get_const_tuple(kernel.shape)
    user_pad_left, user_pad_right = nn.get_pad_tuple1d(padding, kernel_size)

    out_width = ((inp_width - 1) * stride_w + kernel_size
                 - user_pad_left - user_pad_right)
    dilated_width = stride_w * (inp_width - 1) + 1

    # Zero border actually materialised around the dilated input.
    lead = kernel_size - 1 - user_pad_left
    trail = kernel_size - 1 - user_pad_right

    # Dilated + padded input: a real sample only where the offset from the
    # leading border is inside the dilated span and a multiple of the stride.
    padded = tvm.compute(
        (batch, inp_channels, lead + dilated_width + trail),
        lambda n, c, x: tvm.if_then_else(
            tvm.all(x >= lead,
                    x < lead + dilated_width,
                    tvm.indexmod(x - lead, stride_w).equal(0)),
            data[n, c, tvm.indexdiv(x - lead, stride_w)],
            tvm.const(0., "float32")),
        name='data_pad')

    # Reduction over input channels and kernel taps (kernel is read flipped).
    dc = tvm.reduce_axis((0, inp_channels), name='dc')
    dw = tvm.reduce_axis((0, kernel_size), name='dw')
    return tvm.compute(
        (batch, out_channels, out_width),
        lambda b, c, w: tvm.sum(
            padded[b, dc, w + dw].astype(out_dtype) *
            kernel[dc, c, kernel_size - 1 - dw].astype(out_dtype),
            axis=[dc, dw]),
        tag="conv1d_transpose_ncw")
def Gemm_tv2_reorder2_3_vec1_para1_unrollv1_packv1_writecachev1_config_define(
        N, K, M, dtype):
    """AutoTVM template for a packed GEMM with tunable splits on the cache stage.

    Parameters
    ----------
    N, K, M : int
        Problem sizes.  ``A`` is declared as (M, K), ``B`` as (K, N) and the
        product as (M, N), so that every index used by the compute
        (``A[x, k]`` with ``x < M``, ``B[y, n]`` with ``n < N``) stays inside
        the declared shapes.  For ``M == N`` this is identical to the earlier
        (N, K) / (K, M) declaration.
    dtype : str
        Element type of the two placeholders.

    Returns
    -------
    s : schedule
        The schedule with the configured splits applied.
    args : list
        ``[A, B, C]`` where ``C`` is the tensor returned by ``cache_write``.
        NOTE(review): this is the write-cache tensor, not the tensor named
        'C' declared by ``tvm.compute`` -- confirm this is what callers pass
        to ``tvm.build``.
    """
    # Placeholder shapes follow the indexing of the compute below.
    A = tvm.placeholder((M, K), name='A', dtype=dtype)
    B = tvm.placeholder((K, N), name='B', dtype=dtype)
    k = tvm.reduce_axis((0, K), name='k')

    bn = 32
    # B re-laid out as [N / bn, K, bn].
    packedB = tvm.compute((N / bn, K, bn),
                          lambda x, y, z: B[y, x * bn + z],
                          name='packedB')
    C = tvm.compute(
        (M, N),
        lambda x, y: tvm.sum(
            A[x, k] * packedB[y // bn, k, tvm.indexmod(y, bn)], axis=k),
        name='C')

    s = tvm.create_schedule(C.op)
    k = s[C].op.reduce_axis[0]
    # From here on ``C`` names the write-cache tensor.
    C = s.cache_write(C, 'global')
    # The first axis is bound to ``y`` and the second to ``x``.
    y, x = s[C].op.axis

    cfg = autotvm.get_config()
    # Search space: one two-way split per axis.  Alternatives that were tried
    # and left disabled: a "bn" knob over [1, 2, 4, 8, 16, 32, 64];
    # policy='factors' splits filtered to an inner size <= 64; and plain
    # define_knob tile sizes applied through s[C].split.
    cfg.define_split("tile_x", x, num_outputs=2)
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_k", k, num_outputs=2)

    xo, xi = cfg["tile_x"].apply(s, C, x)
    yo, yi = cfg["tile_y"].apply(s, C, y)
    ko, ki = cfg["tile_k"].apply(s, C, k)

    s[C].reorder(xo, yo, ko, xi, ki, yi)
    s[C].vectorize(yi)
    s[C].parallel(xo)
    s[C].unroll(ki)

    # Packing stage.
    x, y, z = s[packedB].op.axis
    s[packedB].vectorize(z)
    s[packedB].parallel(x)
    s[packedB].unroll(y)
    return s, [A, B, C]
def conv2d_transpose_nchw_cuda(cfg, data, kernel, stride, padding, out_dtype):
    """Declare the transposed 2D convolution (NCHW layout) forward compute.

    The input is first expanded into a zero-filled tensor (``data_pad``) in
    which the original samples sit one stride apart along height and width,
    surrounded by ``kernel_dim - 1 - pad`` zeros on each side.  The output is
    then a reduction of that tensor against the spatially flipped kernel.

    Parameters
    ----------
    cfg : ConfigEntity
        The config for this template; its ``stride`` attribute is set to
        ``stride`` as passed in.

    data : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    kernel : tvm.Tensor
        4-D with shape [in_channel, num_filter, filter_height, filter_width]

    stride : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    out_dtype : str
        The output type. This is used in mixed precision.

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    batch, inp_channels, inp_height, inp_width = get_const_tuple(data.shape)
    _, out_channels, kernel_height, kernel_width = get_const_tuple(kernel.shape)
    stride_height, stride_width = stride
    cfg.stride = stride

    user_top, user_left, user_bottom, user_right = nn.get_pad_tuple(
        padding, (kernel_height, kernel_width))

    # Width geometry.
    out_width = ((inp_width - 1) * stride_width + kernel_width
                 - user_left - user_right)
    left = kernel_width - 1 - user_left
    right = kernel_width - 1 - user_right
    dilated_width = stride_width * (inp_width - 1) + 1

    # Height geometry.
    out_height = ((inp_height - 1) * stride_height + kernel_height
                  - user_top - user_bottom)
    top = kernel_height - 1 - user_top
    bottom = kernel_height - 1 - user_bottom
    dilated_height = stride_height * (inp_height - 1) + 1

    # Dilated + padded input: a real sample only where both offsets are
    # inside the dilated span and multiples of their stride.
    padded = tvm.compute(
        (batch, inp_channels,
         top + dilated_height + bottom,
         left + dilated_width + right),
        lambda n, c, y, x: tvm.if_then_else(
            tvm.all(x >= left,
                    x < left + dilated_width,
                    tvm.indexmod(x - left, stride_width).equal(0),
                    y >= top,
                    y < top + dilated_height,
                    tvm.indexmod(y - top, stride_height).equal(0)),
            data[n, c,
                 tvm.indexdiv(y - top, stride_height),
                 tvm.indexdiv(x - left, stride_width)],
            tvm.const(0., "float32")),
        name='data_pad')

    # Reduction over input channels and kernel taps (kernel is read flipped).
    dc = tvm.reduce_axis((0, inp_channels), name='dc')
    dh = tvm.reduce_axis((0, kernel_height), name='dh')
    dw = tvm.reduce_axis((0, kernel_width), name='dw')
    return tvm.compute(
        (batch, out_channels, out_height, out_width),
        lambda b, c, h, w: tvm.sum(
            padded[b, dc, h + dh, w + dw].astype(out_dtype) *
            kernel[dc, c,
                   kernel_height - 1 - dh,
                   kernel_width - 1 - dw].astype(out_dtype),
            axis=[dc, dh, dw]),
        tag="conv2d_transpose_nchw")
################################################################################################### # Just as it is shown in the figure above, after blocking the computations, we can observe the array # access pattern of B (after flattening), which is regular but discontinuous. We expect that after # some transformation we can get continuous access pattern. We can reorder a [16][16] array to # a [16/4][16][4] array, so that the access pattern of B will be sequential when grabing # the corresponding value from the packed array. # # We have to re-write the algorithm slightly. packedB = tvm.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name='packedB') C = tvm.compute( (M, N), lambda x, y: tvm.sum( A[x, k] * packedB[y // bn, k, tvm.indexmod(y, bn)], axis=k), name='C') s = tvm.create_schedule(C.op) xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) k, = s[C].op.reduce_axis ko, ki = s[C].split(k, factor=4) s[C].reorder(xo, yo, ko, xi, ki, yi) s[C].vectorize(yi) x, y, z = s[packedB].op.axis s[packedB].vectorize(z) s[packedB].parallel(x)
# Baseline: default schedule, built, run once, checked against `answer`
# and timed.
s = tvm.create_schedule(CMatrix.op)
f = tvm.build(s, [MatrixA, MatrixB, CMatrix], target=target, name='mmult')
c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
f(a, b, c)
tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
# NOTE(review): `eval` shadows the Python builtin for the rest of the module.
eval = f.time_evaluator(f.entry_name, ctx, number=1)
print('Without optimizations: %f' % eval(a, b, c).mean)

# Re-declare the computation on a packed copy of MatrixB:
# packedMatrixB[x, y, z] holds MatrixB[y, x * bn + z].
bn = 32
packedMatrixB = tvm.compute((N / bn, K, bn), lambda x, y, z: MatrixB[y, x * bn + z], name='packedMatrixB')
CMatrix = tvm.compute(
    (M, N),
    lambda x, y: tvm.sum(MatrixA[x, k] * packedMatrixB[y // bn, k, tvm.indexmod(y, bn)], axis=k),
    name='CMatrix')

s = tvm.create_schedule(CMatrix.op)
# Write cache in 'global' scope, attached under the outer column axis `yo`.
newCMatrix = s.cache_write(CMatrix, 'global')
xo, yo, xi, yi = s[CMatrix].tile(CMatrix.op.axis[0], CMatrix.op.axis[1], bn, bn)
s[newCMatrix].compute_at(s[CMatrix], yo)

# Axes of the cache stage; its reduction axis is split by 4.
xc, yc = s[newCMatrix].op.axis
k, = s[newCMatrix].op.reduce_axis
ko, ki = s[newCMatrix].split(k, factor=4)
s[newCMatrix].reorder(ko, xc, ki, yc)
s[newCMatrix].unroll(ki)
s[newCMatrix].vectorize(yc)
# Parallelization over `xo` is left disabled:
#s[CMatrix].parallel(xo)
def _static_dims(shape):
    """Return ``shape`` as a tuple, unwrapping dims that expose ``.value``."""
    # NOTE(review): presumably constant TVM dims carry their integer in
    # ``.value``; anything without that attribute is compared as-is.
    return tuple(getattr(dim, "value", dim) for dim in shape)


def fused_convs(input_data, filters, resnet_block=False):
    """Chain NHWC convolution stages, optionally closing with a residual add.

    Each entry of ``filters`` adds an optional padding stage (whenever its
    ``kernel`` is larger than 1) followed by either a normal convolution or a
    depthwise convolution.  Every stage reads the output of the previous one.

    Parameters
    ----------
    input_data : tvm.Tensor
        First input, indexed as [batch, height, width, channel].  Its
        ``dtype`` is used as the accumulation/output dtype of every stage.
    filters : iterable
        Objects exposing ``placeholder``, ``depthwise``, ``kernel``,
        ``stride``, ``padding``, ``dilation`` and ``NHWC_transpose``.
        ``stride`` and ``dilation`` may each be an int or a pair.  With
        ``NHWC_transpose`` the filter tensor is read as HWOI, otherwise HWIO.
    resnet_block : bool
        When true, append an element-wise sum of the first node and the last
        node, which must have the same shape.

    Returns
    -------
    nodes : list
        ``input_data`` followed by every padding / convolution / add stage,
        in creation order.
    params : list
        ``input_data``, each filter placeholder in order, then the final node.

    Raises
    ------
    ValueError
        If ``resnet_block`` is set but no layer was added, or the first and
        last node shapes differ.
    """
    out_dtype = input_data.dtype
    nodes = [input_data]
    params = [input_data]

    # The counters live outside the loop so that repeated stages of the same
    # kind get distinct name suffixes (_0, _1, ...).
    padded_count = 0
    conv_count = 0
    depthwise_count = 0

    for f in filters:
        Input = nodes[-1]
        Filter = f.placeholder
        depthwise = f.depthwise
        kernel = f.kernel
        stride = f.stride
        padding = f.padding
        dilation = f.dilation

        assert not (depthwise and kernel == 1)  # Don't consider 1by1 depthwise

        if isinstance(stride, int):
            stride_h = stride_w = stride
        else:
            stride_h, stride_w = stride

        if isinstance(dilation, int):
            dilation_h = dilation_w = dilation
        else:
            dilation_h, dilation_w = dilation

        batch, in_height, in_width, in_channel = Input.shape
        if f.NHWC_transpose:  # HWOI
            kernel_h, kernel_w, tmp, kernel_channel = Filter.shape
        else:  # HWIO
            kernel_h, kernel_w, kernel_channel, tmp = Filter.shape
        if depthwise:
            channel_multiplier = tmp
        else:
            num_filter = tmp

        # compute the output shape
        dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
        dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
        pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
            padding, (dilated_kernel_h, dilated_kernel_w))
        out_channel = simplify(in_channel * channel_multiplier) if depthwise else num_filter
        out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
        out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

        if f.kernel > 1:
            print("Padding is needed!")
            pad_before = [0, pad_top, pad_left, 0]
            pad_after = [0, pad_down, pad_right, 0]
            PaddedInput = pad(Input, pad_before, pad_after,
                              name="PaddedInput_{}".format(padded_count))
            padded_count += 1
            nodes.append(PaddedInput)

            # The convolution below reads the padded tensor.
            Input = PaddedInput
            batch, in_height, in_width, in_channel = Input.shape

        # Reduction axes: channels for a normal convolution, the kernel
        # window whenever the kernel is larger than 1x1.
        if not depthwise:
            rc = tvm.reduce_axis((0, in_channel), name='rc')
        if kernel > 1:
            ry = tvm.reduce_axis((0, kernel_h), name='ry')
            rx = tvm.reduce_axis((0, kernel_w), name='rx')

        if not depthwise:  # Normal convolution
            if kernel > 1:
                Output = tvm.compute(
                    (batch, out_height, out_width, out_channel),
                    lambda nn, yy, xx, ff: tvm.sum(
                        Input[nn, yy * stride_h + ry * dilation_h,
                              xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
                        (Filter[ry, rx, ff, rc] if f.NHWC_transpose
                         else Filter[ry, rx, rc, ff]).astype(out_dtype),
                        axis=[ry, rx, rc]),
                    name="Conv2dOutput_{}".format(conv_count),
                    tag="conv2d_nhwc")
            else:  # Only reduce rc axis
                Output = tvm.compute(
                    (batch, out_height, out_width, out_channel),
                    lambda nn, yy, xx, ff: tvm.sum(
                        Input[nn, yy * stride_h, xx * stride_w, rc].astype(out_dtype) *
                        (Filter[0, 0, ff, rc] if f.NHWC_transpose
                         else Filter[0, 0, rc, ff]).astype(out_dtype),
                        axis=[rc]),
                    name="Conv2dOutput_{}".format(conv_count),
                    tag="conv2d_nhwc")
            conv_count += 1
        else:  # Depthwise convolution (kernel > 1)
            Output = tvm.compute(
                (batch, out_height, out_width, out_channel),
                lambda b, i, j, c: tvm.sum(
                    (Input[b, i*stride_h + ry*dilation_h, j*stride_w + rx*dilation_w,
                           tvm.indexdiv(c, channel_multiplier)].astype(out_dtype) *
                     (Filter[ry, rx,
                             tvm.indexmod(c, channel_multiplier),
                             tvm.indexdiv(c, channel_multiplier)]
                      if f.NHWC_transpose else
                      Filter[ry, rx,
                             tvm.indexdiv(c, channel_multiplier),
                             tvm.indexmod(c, channel_multiplier)]).astype(out_dtype)),
                    axis=[ry, rx]),
                name='DepthwiseConv2dOutput_{}'.format(depthwise_count),
                tag="depthwise_nhwc")
            depthwise_count += 1

        nodes.append(Output)
        params.append(Filter)

    if resnet_block:
        if len(nodes) == 1:
            raise ValueError("resnet_block requires at least one filter")
        First = nodes[0]
        Last = nodes[-1]
        # Compare dimension by dimension rather than comparing the two shape
        # containers themselves.
        if _static_dims(First.shape) != _static_dims(Last.shape):
            raise ValueError(
                "resnet_block requires the first and last node to share a shape")
        # Plain element-wise addition: there is no reduction axis here.
        Output = tvm.compute(
            (batch, out_height, out_width, out_channel),
            lambda b, i, j, c: (First[b, i, j, c].astype(out_dtype) +
                                Last[b, i, j, c].astype(out_dtype)),
            name='ElementwiseAddOutput_{}'.format(depthwise_count),
            tag="elem_nhwc")
        nodes.append(Output)

    params.append(nodes[-1])  # Final output
    return nodes, params
stmt='answer = numpy.dot(a, b)', number=np_repeat)
# The call closed above started before this fragment; its result is presumably
# what `np_runing_time` holds -- confirm against the preceding lines.
print("Numpy running time: %f" % (np_runing_time / np_repeat))

# Reference result that the TVM output is compared against.
answer = numpy.dot(a.asnumpy(), b.asnumpy())

# Algorithm
k = tvm.reduce_axis((0, K), 'k')
A = tvm.placeholder((M, K), name='A')
B = tvm.placeholder((K, N), name='B')

bn = 32
# packedB[x, y, z] holds B[y, x * bn + z].
packedB = tvm.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name='packedB')
C = tvm.compute((M, N), lambda x, y: tvm.sum(A[x, k] * packedB[y // bn, k, tvm.indexmod(y, bn)], axis=k), name = 'C')

s = tvm.create_schedule(C.op)

# Allocate write cache
CC = s.cache_write(C, 'global')

# Tile both output axes by bn.
xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)

# Write cache is computed at yo
s[CC].compute_at(s[C], yo)

# New inner axes
xc, yc = s[CC].op.axis