def matmul_v3(N, L, M, dtype): A = tvm.placeholder((N, L), name='A', dtype=dtype) B = tvm.placeholder((L, M), name='B', dtype=dtype) k = tvm.reduce_axis((0, L), name='k') C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') s = tvm.create_schedule(C.op) # schedule y, x = s[C].op.axis k = s[C].op.reduce_axis[0] ##### define space begin ##### cfg = autotvm.get_config() # use a filter that only accepts the split scheme whose inner most tile is less then 4 cfg.define_split('tile_y', y, policy='factors', filter=lambda x: x.size[-1] <= 4) cfg.define_split('tile_y', x, policy='factors', filter=lambda x: x.size[-1] <= 4) ##### define space end ##### # schedule according to config yo, yi = cfg["tile_y"].apply(s, C, y) xo, xi = cfg["tile_x"].apply(s, C, x) s[C].reorder(yo, xo, k, yi, xi) return s, [A, B, C]
def matmul_v4(N, L, M, dtype): A = tvm.placeholder((N, L), name='A', dtype=dtype) B = tvm.placeholder((L, M), name='B', dtype=dtype) k = tvm.reduce_axis((0, L), name='k') C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') s = tvm.create_schedule(C.op) y, x = s[C].op.axis k = s[C].op.reduce_axis[0] cfg = autotvm.get_config() # define search space #tiling cfg.define_knob("tile_y", [1, 2, 4, 8, 16, 32, 64]) cfg.define_knob("tile_x", [1, 2, 4, 8, 16, 32, 64]) yo, yi = s[C].split(y, cfg['tile_y'].val) xo, xi = s[C].split(x, cfg['tile_x'].val) # cfg.define_split("tile_f", f, num_outputs=4) # cfg.define_split("tile_y", y, num_outputs=4) # cfg.define_split("tile_x", x, num_outputs=4) # cfg.define_split("tile_rc", rc, num_outputs=3) # cfg.define_split("tile_ry", ry, num_outputs=3) # cfg.define_split("tile_rx", rx, num_outputs=3) #reordering cfg.define_reorder("ordering", (yo, xo, k, yi, xi), policy="all") #interval_all cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) cfg.define_knob("unroll_explicit", [0, 1]) #other optimization skills/tricks s[C].vectorize(yi) return s, [A, B, C]
def matmul(N, L, M, dtype): A = tvm.placeholder((N, L), name='A', dtype=dtype) B = tvm.placeholder((L, M), name='B', dtype=dtype) k = tvm.reduce_axis((0, L), name='k') C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') s = tvm.create_schedule(C.op) # schedule y, x = s[C].op.axis k = s[C].op.reduce_axis[0] ##### define space begin ##### cfg = autotvm.get_config() cfg.define_split("tile_y", y, num_outputs=2) cfg.define_split("tile_x", x, num_outputs=2) ##### define space end ##### # schedule according to config yo, yi = cfg["tile_y"].apply(s, C, y) xo, xi = cfg["tile_x"].apply(s, C, x) s[C].reorder(yo, xo, k, yi, xi) return s, [A, B, C]
def conv2d_winograd_autotvm(s, ic, oc): cfg = autotvm.get_config() cfg.define_knob('unroll', [1]) cfg.define_knob('compute_at', [0]) cfg.define_knob('vectorize', [1]) cfg.define_knob('tensorize', [1]) cfg.define_knob('VK', [6]) cfg.define_knob('VP', [8]) for intermediate in ["M", "A_T_dot_M", "input_tile", "B_T_dot_X", "V"]: cfg.define_knob("{}_COMPUTE_AT".format(intermediate), [0, 1]) for intermediate in ["input_tile", "V"]: # , "B_T_dot_X", cfg.define_knob("{}_REORDER_C".format(intermediate), [0, 1]) cfg.define_knob('data_pad_inline', [0, 1]) VK = cfg['VK'].val VP = cfg['VP'].val X = tvm.placeholder(shape=(1, ic, s, s), dtype="float32", name="X") W = tvm.placeholder(shape=(oc, ic, 3, 3), dtype="float32", name="W") Y, input_tile, U, output = decl_winograd(cfg, X, W, strides=1, padding=1, layout="NCHW", out_dtype="float32", VK=VK, VP=VP) s = schedule_winograd(cfg, Y, VK=VK, VP=VP) if cfg.flop == 0: cfg.add_flop(2 * ic * oc * s * s * 3 * 3) #print(tvm.lower(s, [X, W, output], simple_mode=True)) return s, [input_tile, U, Y]
def test_workload_padding( self, out_dtype, layout, input_shape, filter_shape, target, ref_data, stride, padding, dilation, ): input_np, filter_np, scale_np, shift_np, output_np = ref_data if layout == "NCHW": _, _, out_height, out_width = output_np.shape elif layout == "NHWC": _, out_height, out_width, _ = output_np.shape elif layout == "NCHWc": _, _, out_height, out_width, _ = output_np.shape Input = te.placeholder(input_shape, name="Input") Filter = te.placeholder(filter_shape, name="Filter") wkl = _get_workload(Input, Filter, (stride, stride), padding, dilation, out_dtype, layout) # check if tile_ow candidates are the factors of the right output weight. with tvm.target.Target(target): cfg = autotvm.get_config() _fallback_schedule(cfg, wkl) ow_tile = np.prod(cfg["tile_ow"].size) tvm.testing.assert_allclose(ow_tile, out_width)
def pixelcnn_autotvm(N, H, W, CO, CI, KH, KW, mask_type, bias, stride, padding, dilation): # assert N == 1, "Only consider batch_size = 1 in this template" # data = tvm.te.placeholder((N, CI, H, W), name='data') # kernel = tvm.te.placeholder((CO, CI, KH, KW), name='kernel') # conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=dilation, out_dtype='float32') convop, tensors = pixelcnn(N, H, W, CI, CO, KH, KW, mask_type, bias=bias, stride=stride, padding=padding, dilation=dilation) s = tvm.te.create_schedule(convop) cfg = autotvm.get_config() ##### space definition begin ##### schedule_direct_cuda(cfg, s, *tensors[-2:]) return s, [*tensors]
def Gemm_tv2_reorder2_3_vec1_para1_unrollv1_config_define(N, K, M, dtype): A = tvm.placeholder((N, K), name='A', dtype=dtype) B = tvm.placeholder((K, M), name='B', dtype=dtype) k = tvm.reduce_axis((0, K), name='k') C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') s = tvm.create_schedule(C.op) k = s[C].op.reduce_axis[0] y, x = s[C].op.axis cfg = autotvm.get_config() cfg.define_split("tile_x", x, num_outputs=2) cfg.define_split("tile_y", y, num_outputs=2) cfg.define_split("tile_k", k, num_outputs=2) # cfg.define_split('tile_x', x, policy='factors', filter=lambda x: x.size[-1] <= 64) # cfg.define_split('tile_y', y, policy='factors', filter=lambda x: x.size[-1] <= 64) # cfg.define_split('tile_k', k, policy='factors', filter=lambda x: x.size[-1] <= 64) xo, xi = cfg["tile_x"].apply(s, C, x) yo, yi = cfg["tile_y"].apply(s, C, y) ko, ki = cfg["tile_k"].apply(s, C, k) # cfg.define_knob("tile_x", [1, 4, 8, 16, 32, 64]) # cfg.define_knob("tile_y", [1, 4, 8, 16, 32, 64]) # cfg.define_knob("tile_k", [1, 4, 8, 16, 32, 64]) # xo, xi = s[C].split(x, cfg['tile_x'].val) # yo, yi = s[C].split(y, cfg['tile_y'].val) # ko, ki = s[C].split(k, cfg['tile_k'].val) s[C].reorder(xo, yo, ko, xi, ki, yi) s[C].vectorize(yi) s[C].parallel(xo) s[C].unroll(ki) return s, [A, B, C]
def matmul_v1(N, L, M, dtype): A = tvm.placeholder((N, L), name='A', dtype=dtype) B = tvm.placeholder((L, M), name='B', dtype=dtype) k = tvm.reduce_axis((0, L), name='k') C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') s = tvm.create_schedule(C.op) # schedule y, x = s[C].op.axis k = s[C].op.reduce_axis[0] ##### define space begin ##### cfg = autotvm.get_config() cfg.define_split("tile_y", y, num_outputs=2) cfg.define_split("tile_x", x, num_outputs=2) cfg.define_annotate("unroll", policy="try_unroll") #policy (str) – name of policy If is ‘unroll’, unroll the axes. # If is ‘try_unroll’, try to unroll the axes. # If is ‘try_unroll_vec’, try to unroll or vectorize the axes. # If is ‘bind_gpu’, bind the first few axes to gpu threads. # If is ‘locate_cache’, choose n axes to attach shared/local cache. cfg.define_annotate("vec", policy="try_unroll_vec") cfg.define_annotate("cache", policy="locate_cache") #policy (str) – name of policy If is ‘identity’, do an identity permutation. # If is ‘all’, try all permutations. # If is ‘interval_all’, try all permutations of an interval of axes. # If is ‘candidate’, try listed candidate. # If is ‘interleave’, interleave chains of spatial axes and chains of reduction axes. cfg.define_reorder("reorder", policy="interval_all") ##### define space end ##### # schedule according to config yo, yi = cfg["tile_y"].apply(s, C, y) xo, xi = cfg["tile_x"].apply(s, C, x) s[C].reorder(yo, xo, k, yi, xi) return s, [A, B, C]
def my_tune(n): #tx, ty, tk = 64, 64, 16 # tile sizes cfg = autotvm.get_config() cfg.define_knob("tile_x", [16, 64, 128]) cfg.define_knob("tile_y", [16, 64, 128]) cfg.define_knob("tile_k", [4, 16, 64, 128]) tx = cfg["tile_x"].val ty = cfg["tile_y"].val tk = cfg["tile_k"].val A, B, C = matmul(n, n, n) s = te.create_schedule(C.op) # Create a write cache for C CachedC = s.cache_write(C, 'local') # Same as before, first tile by blocks, and then parallelize the # computation of each block xo, yo, xi, yi = s[C].tile(*C.op.axis, tx, ty) xy = s[C].fuse(xo, yo) s[C].parallel(xy) # Use the write cache for the output of the xy axis, namely a block. s[CachedC].compute_at(s[C], xy) # Same as before to optimize the computation of a block . xc, yc = s[CachedC].op.axis ko, ki = s[CachedC].split(CachedC.op.reduce_axis[0], factor=tk) s[CachedC].reorder(ko, xc, ki, yc) s[CachedC].unroll(ki) s[CachedC].vectorize(yc) return s, [A, B, C]
def matmul_v1(N, L, M, dtype): A = tvm.placeholder((N, L), name='A', dtype=dtype) B = tvm.placeholder((L, M), name='B', dtype=dtype) k = tvm.reduce_axis((0, L), name='k') C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') s = tvm.create_schedule(C.op) # schedule y, x = s[C].op.axis k = s[C].op.reduce_axis[0] # 2. get the config object cfg = autotvm.get_config() # 3. define search space cfg.define_knob("tile_y", [1, 2, 4, 8, 16]) cfg.define_knob("tile_x", [1, 2, 4, 8, 16]) # 4. schedule according to config yo, yi = s[C].split(y, cfg['tile_y'].val) xo, xi = s[C].split(x, cfg['tile_x'].val) s[C].reorder(yo, xo, k, yi, xi) return s, [A, B, C]
def softmax_naive(a, b, c, d): A = tvm.placeholder((a, b, c, d), dtype='float32', name='A') B = topi.nn.softmax(A, axis=1) s = tvm.create_schedule([B.op]) Passes.enable_autotune(s, [B], autotvm.get_config(), mode=Passes.NAIVE) return s, [A, B]
def gemm_v1(M, N, K, dtype): A = te.placeholder((M, K), name='A', dtype=dtype) B = te.placeholder((K, N), name='B', dtype=dtype) # compute k = te.reduce_axis((0, K), name='k') C = te.compute((M, N), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C') # schedule s = te.create_schedule(C.op) y, x = s[C].op.axis k = s[C].op.reduce_axis[0] # define search space cfg = autotvm.get_config() cfg.define_split("tile_x", x, num_outputs=2) cfg.define_split("tile_y", y, num_outputs=2) cfg.define_split("tile_k", k, num_outputs=2) # apply config xo, xi = cfg["tile_x"].apply(s, C, x) yo, yi = cfg["tile_y"].apply(s, C, y) ko, ki = cfg["tile_k"].apply(s, C, k) # define order # cfg.define_reorder("reorder", [yo, xo, ko, yi, ki, xi], "all") # cfg["reorder"].apply(s, C, [yo, xo, ko, yi, ki, xi]) # other s[C].reorder(yo, xo, ko, yi, ki, xi) s[C].vectorize(xi) s[C].unroll(ki) s[C].parallel(xo) return s, [A, B, C]
def Gemm_tv2_reorder2_3_vec1_para1_config_define(N, K, M, dtype): A = tvm.placeholder((N, K), name='A', dtype=dtype) B = tvm.placeholder((K, M), name='B', dtype=dtype) k = tvm.reduce_axis((0, K), name='k') C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') s = tvm.create_schedule(C.op) k = s[C].op.reduce_axis[0] y, x = s[C].op.axis cfg = autotvm.get_config() cfg.define_split("tile_x", x, num_outputs=2) cfg.define_split("tile_y", y, num_outputs=2) cfg.define_split("tile_k", k, num_outputs=2) # cfg.define_split('tile_x', x, policy='candidate', candidate=[[1, 4, 4,8], [4, 1, 4,8]]) # cfg.define_split('tile_y', y, policy='candidate', candidate=[[1, 4, 4,8], [4, 1, 4,8]]) # cfg.define_split('tile_k', k, policy='candidate', candidate=[[1, 4, 4,8], [4, 1, 4,8]]) xo, xi = cfg["tile_x"].apply(s, C, x) yo, yi = cfg["tile_y"].apply(s, C, y) ko, ki = cfg["tile_k"].apply(s, C, k) print(xo) s[C].reorder(xo, yo, ko, xi, ki, yi) s[C].vectorize(yi) s[C].parallel(xo) return s, [A, B, C]
def test_workload_padding( self, target, input_shape, weight_shape, stride, padding, dilation, dtype, ref_data, ): a_np, w_np, b_np, c_np = ref_data _, _, out_height, out_width = c_np.shape A = te.placeholder(input_shape, name="A", dtype=dtype) W = te.placeholder(weight_shape, name="W", dtype=dtype) with tvm.target.Target(target): wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) # check if tile_ow candidates are the factors of the right output weight. cfg = autotvm.get_config() _fallback_schedule(cfg, wkl) ow_tile = np.prod(cfg["tile_ow"].size) tvm.testing.assert_allclose(ow_tile, out_width)
def matmul(N, L, M, dtype): A = te.placeholder((N, L), name="A", dtype=dtype) B = te.placeholder((L, M), name="B", dtype=dtype) k = te.reduce_axis((0, L), name="k") C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") s = te.create_schedule(C.op) # schedule y, x = s[C].op.axis k = s[C].op.reduce_axis[0] ##### define space begin ##### cfg = autotvm.get_config() cfg.define_split("tile_y", y, num_outputs=2) cfg.define_split("tile_x", x, num_outputs=2) ##### define space end ##### # schedule according to config yo, yi = cfg["tile_y"].apply(s, C, y) # Make sure configurations have a varied number of itervars. Splitting adds # new itervars, so conditionally splitting with cause the number of # itervars to depend on the tile size. if cfg["tile_x"].size[-1] > 1: xo, xi = cfg["tile_x"].apply(s, C, x) s[C].reorder(yo, xo, k, yi, xi) else: s[C].reorder(yo, k, yi, x) return s, [A, B, C]
def output_transform_autotvm(dtype): cfg = autotvm.get_config() cfg.define_knob('VK', [2, 4, 8, 16]) cfg.define_knob('VP', [4, 8, 16]) VK = cfg['VK'].val VP = cfg['VP'].val X = tvm.placeholder(shape=(1, 64, 56, 56), dtype="float32", name="X") W = tvm.placeholder(shape=(64, 64, 56, 56), dtype="float32", name="W") N = get_const_int(X.shape[0]) IH = get_const_int(X.shape[2]) IW = get_const_int(X.shape[3]) OH = get_const_int((IH + 2 * HPAD - 3) // HSTR + 1) OW = get_const_int((IW + 2 * WPAD - 3) // WSTR + 1) nH, nW = get_const_int((OH + m - 1) // m), get_const_int((OW + m - 1) // m) def round_up(a, b): return ((a + b - 1) // b) * b P = round_up(N * nH * nW, VP) K = get_const_int(W.shape[0]) assert K % VK == 0 assert P % VP == 0 cfg.define_knob('use_minimal', [1]) M = tvm.placeholder(shape=(K // VK, P // VP, alpha, alpha, VK, VP), name="M") if cfg['use_minimal'].val: output = decl_output_transform_minimal(cfg, X, M, VK, VP) else: output = decl_output_transform(cfg, X, M, VK, VP) s = schedule_output_transform(cfg, output) #print(tvm.lower(s, [X, M, output], simple_mode=True)) return s, [X, M, output]
def Gemm_tv2_reorder2_3_vec1_para1_config_define(N, K, M, dtype): A = tvm.placeholder((N, K), name='A', dtype=dtype) B = tvm.placeholder((K, M), name='B', dtype=dtype) k = tvm.reduce_axis((0, K), name='k') C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') s = tvm.create_schedule(C.op) k = s[C].op.reduce_axis[0] y, x = s[C].op.axis cfg = autotvm.get_config() cfg.define_split("tile_x", x , num_outputs=2) cfg.define_split("tile_y", y , num_outputs=2) cfg.define_split("tile_k", k , num_outputs=2) ''' >>> # use custom candidates >>> cfg.define_split('tile_x', x, policy='candidate', candidate=[[1, 4, 4], [4, 1, 4]])''' #>>> # use a filter that only accepts the split scheme whose inner most tile is less then 64 # cfg.define_split('tile_x', x, policy='factors', filter=lambda x: x.size[-1] <= 64) # cfg.define_split('tile_y', y, policy='factors', filter=lambda x: x.size[-1] <= 64) # cfg.define_split('tile_k', k, policy='factors', filter=lambda x: x.size[-1] <= 64) # cfg.define_knob("tile_x", [1, 4, 8, 16, 32, 64]) # cfg.define_knob("tile_y", [1, 4, 8, 16, 32, 64]) # cfg.define_knob("tile_k", [1, 4, 8, 16, 32, 64]) # xo, xi = s[C].split(x, cfg['tile_x'].val) # yo, yi = s[C].split(y, cfg['tile_y'].val) # ko, ki = s[C].split(k, cfg['tile_k'].val) xo, xi = cfg["tile_x"].apply(s, C, x) yo, yi = cfg["tile_y"].apply(s, C, y) ko, ki = cfg["tile_k"].apply(s, C, k) s[C].reorder(xo,yo,ko,xi,ki,yi) s[C].vectorize(yi) s[C].parallel(xo) return s, [A, B, C]
def CONVAutoTVM(*args): global function def getSplit(maxNum): splitList = [] para = 2 while (True): if para < maxNum / 4 and para <= 32: splitList.append(para) para *= 2 else: break if len(splitList) == 0: splitList.append(1) return splitList ops, bufs = function(*args) s = tvm.create_schedule(ops) # get bias_tensor, conv_tensor, pad_tensor and their ops relatively bias_tensor = None conv_tensor = None pad_tensor = None conv_tensor = bufs[len(bufs) - 1] in_tensor2 = conv_tensor.op.input_tensors[1] in_tensor1 = conv_tensor.op.input_tensors[0] if in_tensor2.op.name == "bias": bias_tensor = conv_tensor conv_tensor = in_tensor1 in_tensor1 = conv_tensor.op.input_tensors[0] pad_tensor = in_tensor1 if bias_tensor != None: bias_op = s[bias_tensor] conv_op = s[conv_tensor] pad_op = s[pad_tensor] oc = conv_op.op.axis[1] x = conv_op.op.axis[2] y = conv_op.op.axis[3] ic = conv_op.op.reduce_axis[0] kh = conv_op.op.reduce_axis[1] kw = conv_op.op.reduce_axis[2] cfg = autotvm.get_config() cfg.define_knob("split_oc", getSplit(int(oc.dom.extent))) cfg.define_knob("split_x", getSplit(int(x.dom.extent))) cfg.define_knob("split_y", getSplit(int(y.dom.extent))) cfg.define_knob("split_ic", getSplit(int(ic.dom.extent))) oco, oci = conv_op.split(oc, cfg["split_oc"].val) xo, xi = conv_op.split(x, cfg["split_x"].val) yo, yi = conv_op.split(y, cfg["split_y"].val) ico, ici = conv_op.split(ic, cfg["split_ic"].val) conv_op.reorder(oco, ico, xo, yo, oci, ici, kh, kw, xi, yi) cfg.define_annotate("yi_unroll", [yi], policy='try_unroll') pad_op.compute_inline() return s, bufs
def matmul(): # Algorithm k = te.reduce_axis((0, K), 'k') A = te.placeholder((M, K), name='A') B = te.placeholder((K, N), name='B') ##### define space begin ##### cfg = autotvm.get_config() cfg.define_split("tile_x", M, num_outputs=3) cfg.define_split("tile_y", N, num_outputs=3) cfg.define_split("tile_k", K, num_outputs=2) ##### define space end ##### # We have to re-write the algorithm slightly. bn = cfg["tile_y"].size[-1] packedB = te.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name='packedB') C = te.compute( (M, N), lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, y % bn], axis=k), name='C') s = te.create_schedule(C.op) x, y = s[C].op.axis k, = s[C].op.reduce_axis # schedule according to config # Allocate write cache CC = s.cache_write(C, 'global') xt, xo, xi = cfg["tile_x"].apply(s, C, x) yt, yo, yi = cfg["tile_y"].apply(s, C, y) s[C].reorder(xt, yt, xo, yo, xi, yi) xyt = s[C].fuse(xt, yt) # parallel s[C].parallel(xyt) xyo = s[C].fuse(xo, yo) s[C].unroll(xi) s[C].vectorize(yi) # Write cache is computed at xyo s[CC].compute_at(s[C], xyt) # New inner axes xc, yc = s[CC].op.axis k, = s[CC].op.reduce_axis ko, ki = cfg["tile_k"].apply(s, CC, k) s[CC].reorder(ko, xc) code = tvm.lower(s, [A, B, C], simple_mode=True) cfg.define_reorder("reorder", [ko, xc, ki, yc], "all") cfg["reorder"].apply(s, CC, [ko, xc, ki, yc]) cfg.define_annotate('ann', [ko, xc, ki, yc], policy='try_unroll_vec') cfg['ann'].apply(s, CC, [ko, xc, ki, yc]) x, y, z = s[packedB].op.axis s[packedB].vectorize(z) s[packedB].parallel(x) return s, [A, B, C]
def gemm(M, N, K): A = tvm.placeholder((M, K), name='A') B = tvm.placeholder((K, N), name='B') k = tvm.reduce_axis((0, K), 'k') C = tvm.compute((M, N), lambda x, y: tvm.sum(A[x, k] * B[k, y], axis=k), name='C') s = tvm.create_schedule([C.op]) cfg = autotvm.get_config() local_C = s.cache_write(C, "local") #schedule C h_C, w_C = s[C].op.axis cfg.define_split("h_C", h_C, num_outputs=4) cfg.define_split("w_C", w_C, num_outputs=4) bh, vth, th, h = cfg["h_C"].apply(s, C, h_C) bw, vtw, tw, w = cfg["w_C"].apply(s, C, w_C) s[C].bind(bh, tvm.thread_axis("blockIdx.x")) s[C].bind(bw, tvm.thread_axis("blockIdx.y")) s[C].bind(vth, tvm.thread_axis("vthread")) s[C].bind(vtw, tvm.thread_axis("vthread")) s[C].bind(th, tvm.thread_axis("threadIdx.x")) s[C].bind(tw, tvm.thread_axis("threadIdx.y")) s[C].reorder(bh, bw, vth, vtw, th, tw, h, w) #schedule local_C s[local_C].compute_at(s[C], tw) hi, wi = s[local_C].op.axis rk = s[local_C].op.reduce_axis[0] cfg.define_split("rk", rk, num_outputs=2) rko, rki = cfg["rk"].apply(s, local_C, rk) s[local_C].reorder(rko, rki, hi, wi) #schedule share_A and share_B share_A = s.cache_read(A, 'shared', local_C) s[share_A].compute_at(s[local_C], rko) sh_h, sh_w = s[share_A].op.axis th, sh_h = s[share_A].split(sh_h, nparts=cfg["h_C"].size[2]) tw, sh_w = s[share_A].split(sh_w, nparts=cfg["w_C"].size[2]) s[share_A].bind(th, tvm.thread_axis("threadIdx.x")) s[share_A].bind(tw, tvm.thread_axis("threadIdx.y")) share_B = s.cache_read(B, "shared", local_C) s[share_B].compute_at(s[local_C], rko) sh_h, sh_w = s[share_B].op.axis th, sh_h = s[share_B].split(sh_h, nparts=cfg["h_C"].size[2]) tw, sh_w = s[share_B].split(sh_w, nparts=cfg["w_C"].size[2]) s[share_B].bind(th, tvm.thread_axis("threadIdx.x")) s[share_B].bind(tw, tvm.thread_axis("threadIdx.y")) #schedule local_A and local_B local_A = s.cache_read(share_A, "local", local_C) s[local_A].compute_at(s[local_C], rki) local_B = s.cache_read(share_B, "local", local_C) s[local_B].compute_at(s[local_C], rki) return s, [A, B, C]
def conv2d_NCHWc_KCRSk(input_shape, filter_shape): data = te.placeholder(input_shape, name="data", dtype="float32") filt = te.placeholder(filter_shape, name="filter", dtype="float32") conv = compute_conv2d_NCHWc_KCRSk(data, filt, [1, 1], [0, 0], [1, 1], "float32") cfg = autotvm.get_config() s = te.create_schedule([x.op for x in [conv]]) schedule_conv2d_NCHWc_KCRSk(cfg, s, conv) return s, (data, filt, conv)
def depthwise_conv2d_NCHWc_KCRSk_acc32(input_shape, filter_shape): data = te.placeholder(input_shape, name="data", dtype="float32") filt = te.placeholder(filter_shape, name="filter", dtype="float32") output = compute_depthwise_conv2d_NCHWc_KCRSk_acc32( data, filt, [1, 1], [0, 0], [1, 1], "float32") cfg = autotvm.get_config() s = te.create_schedule([x.op for x in [output]]) schedule_depthwise_conv2d_NCHWc_KCRSk_acc32(cfg, s, output) return s, (data, filt, output)
def bgemm_topi(Y, X, K, dtype="uint64"): DB = 1 WB = 1 out_dtype = dtype data_packed = tvm.placeholder((Y, DB, K), dtype=dtype, name="A") weight_packed = tvm.placeholder((X, WB, K), dtype=dtype, name="B") oshape = (Y, X) k = tvm.reduce_axis((0, K), name='k') db = tvm.reduce_axis((0, DB), name='db') wb = tvm.reduce_axis((0, WB), name='wb') matmul = tvm.compute( oshape, lambda i, j: tvm.sum(tvm.popcount(weight_packed[ j, wb, k] & data_packed[i, db, k]).astype(out_dtype) << (db + wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense') s = tvm.create_schedule(matmul.op) cfg = autotvm.get_config() CC = s.cache_write(matmul, "global") y, x = s[matmul].op.axis yo, yi = cfg.define_split("tile_y", y, num_outputs=2, filter=lambda x: x.size[-1] <= 8) xo, xi = cfg.define_split("tile_x", x, num_outputs=2, filter=lambda x: x.size[-1] <= 8) yo, yi = cfg["tile_y"].apply(s, matmul, y) xo, xi = cfg["tile_x"].apply(s, matmul, x) s[matmul].reorder(yo, xo, yi, xi) cfg.define_knob("compute_at_axis", [0, 1, 2]) if cfg["compute_at_axis"].val == 0: s[CC].compute_at(s[matmul], xo) elif cfg["compute_at_axis"].val == 1: s[CC].compute_at(s[matmul], yi) elif cfg["compute_at_axis"].val == 2: s[CC].compute_at(s[matmul], xi) yc, xc = s[CC].op.axis wb, db, k = s[CC].op.reduce_axis cfg.define_reorder("reorder_0", [k, yc, xc], policy="all") cfg["reorder_0"].apply(s, CC, [k, yc, xc]) cfg.add_flop(2 * Y * X * K * int(dtype[4:])) return s, [data_packed, weight_packed, matmul]
def verify_workload_padding(): _, _, out_height, out_width = get_const_tuple(c_np.shape) wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) # check if tile_ow candidates are the factors of the right output weight. cfg = autotvm.get_config() _fallback_schedule(cfg, wkl) ow_tile = np.prod(cfg["tile_ow"].size) tvm.testing.assert_allclose(ow_tile, out_width)
def conv2d_channel_batch(B, N, M, C, K, L, O, stride=1, padding=0, dtype="float32"): A = tvm.placeholder((B, N, M, C), dtype=dtype, name="A") W = tvm.placeholder((K, L, C, O), dtype=dtype, name="W") N_out = max(0, (N + padding * 2 - K) // stride) + 1 M_out = max(0, (M + padding * 2 - L) // stride) + 1 Apad = tvm.compute( (B, N + 2 * padding, M + 2 * padding, C), lambda b, i, j, k: tvm.if_then_else( tvm.all(i >= padding, j >= padding, i < N + padding, j < M + padding), A[b, i - padding, j - padding, k], 0.0), name="Apad") rx, ry = tvm.reduce_axis((0, K), name="rx"), tvm.reduce_axis((0, L), name="ry") rc = tvm.reduce_axis((0, C), name="rc") Output = tvm.compute( (B, N_out, M_out, O), lambda b, i, j, k: tvm.sum(Apad[b, i * stride + rx, j * stride + ry, rc ] * W[rx, ry, rc, k], axis=[rx, ry, rc]), name="Output") s = tvm.create_schedule(Output.op) s[Apad].compute_inline() CL = s.cache_write(Output, "local") n, h, w, c = s[Output].op.axis out = s[Output].fuse(h, w) cfg = autotvm.get_config() cfg.define_split("split_n", n, num_outputs=2) cfg.define_split("split_c", c, num_outputs=2) no, ni = cfg["split_n"].apply(s, Output, n) co, ci = cfg["split_c"].apply(s, Output, c) s[Output].reorder(no, out, co, ni, ci) s[Output].parallel(out) # schedule CL s[CL].compute_at(s[Output], co) ni, hi, wi, ci = s[CL].op.axis xi, yi, ki = s[CL].op.reduce_axis cfg.define_split("split_k", ki, num_outputs=2) ko, ki = cfg["split_k"].apply(s, CL, ki) s[CL].reorder(ko, xi, yi, ni, ki, ci) s[CL].unroll(ki) s[CL].vectorize(ci) return s, [A, W, Output]
def conv2d_turning(*args): global function ops, bufs = function(*args) data, kernel, conv = bufs s = tvm.create_schedule(conv.op) ##### space definition begin ##### n, f, y, x = s[conv].op.axis rc, ry, rx = s[conv].op.reduce_axis cfg = autotvm.get_config() cfg.define_split("tile_f", f, num_outputs=4) cfg.define_split("tile_y", y, num_outputs=4) cfg.define_split("tile_x", x, num_outputs=4) cfg.define_split("tile_rc", rc, num_outputs=3) cfg.define_split("tile_ry", ry, num_outputs=3) cfg.define_split("tile_rx", rx, num_outputs=3) cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) ##### space definition end ##### # inline padding pad_data = s[conv].op.input_tensors[0] s[pad_data].compute_inline() data, raw_data = pad_data, data output = conv OL = s.cache_write(conv, 'global') # tile and bind spatial axes n, f, y, x = s[output].op.axis bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) kernel_scope = n # this is the scope to attach global config inside this kernel s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) fuse = s[output].fuse(yi, xi) s[output].vectorize(fuse) if (args[0] == 4 and args[1] == 112 and args[2] == 14 and args[4] == 224): s[output].unroll(fi) s[OL].compute_at(s[output], tx) # tile reduction axes n, f, y, x = s[OL].op.axis rc, ry, rx = s[OL].op.reduce_axis rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc) ryo, rym, ryi = cfg['tile_rx'].apply(s, OL, ry) rxo, rxm, rxi = cfg['tile_ry'].apply(s, OL, rx) s[OL].reorder(n, rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, f, y, x) n, f, y, x = s[conv].op.axis s[conv].parallel(n) #print(tvm.lower(s, [data, kernel, conv], simple_mode=True)) return s, [raw_data, kernel, conv]
def vectoradd_naive(K,dtype): A = tvm.placeholder((K,), name='A', dtype=dtype) B = tvm.placeholder((K,), name='B', dtype=dtype) C = tvm.compute(A.shape, lambda i: A[i] + B[i], name='C') s = tvm.create_schedule(C.op) #### ADDED AUTO AUTOTUNING #### Passes.enable_autotune(s,[C],autotvm.get_config(),mode=Passes.NAIVE) ############################### return s, [A, B, C]
def block_sparse_template(W_sp_np_data_shape, W_sp_np_indices_shape, W_sp_np_indptr_shape, X_np_shape): W_data = te.placeholder(shape=W_sp_np_data_shape, dtype='float32', name='W_data') W_indices = te.placeholder(shape=W_sp_np_indices_shape, dtype='int32', name='W_indices') W_indptr = te.placeholder(shape=W_sp_np_indptr_shape, dtype='int32', name='W_indptr') X = te.placeholder(shape=X_np_shape, dtype='float32', name='X') Y = topi.nn.sparse_dense(X, W_data, W_indices, W_indptr) cfg = autotvm.get_config() cfg.add_flop(W_sp_np_data_shape[0] * X_np_shape[0] * W_sp_np_data_shape[1] * W_sp_np_data_shape[2] * 2) s = schedule_sparse_dense_cuda_allreduce_autotune(cfg, [Y]) return s, [X, W_data, W_indices, W_indptr, Y]
def conv2d(data, kernel, *args): cfg = autotvm.get_config() out = conv2d_NCHWc_int8(cfg, data, kernel, *args) s = tvm.create_schedule(out.op) s = schedule_conv2d_NCHWc_int8(cfg, s, out) fadd = tvm.build(s, [data, kernel, out], 'cuda', name="conv") dev_module = fadd.imported_modules[0] print("-----GPU code-----") print(dev_module.get_source()) print(s) return s, [data, kernel, out]
def relu_naive(m, n): A = tvm.placeholder((m, n), name='A') B = topi.nn.relu(A) with tvm.target.create('llvm'): s = tvm.create_schedule(B.op) cfg = autotvm.get_config() Passes.enable_autotune(s, [B], cfg, mode=Passes.NAIVE) return s, [A, B]
def bad_matmul(N, L, M, dtype): if 'bad_device' in tvm.target.current_target().keys: A = tvm.placeholder((N, L), name='A', dtype=dtype) B = tvm.placeholder((L, M), name='B', dtype=dtype) k = tvm.reduce_axis((0, L-1), name='k') C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') s = tvm.create_schedule(C.op) # schedule y, x = s[C].op.axis cfg = autotvm.get_config() cfg.define_split("tile_y", y, num_outputs=2) cfg.define_split("tile_x", x, num_outputs=2) return s, [A, B, C] return matmul(N, L, M, dtype)
def conv2d_no_batching(N, H, W, CI, CO, KH, KW): """An example template for testing""" assert N == 1, "Only consider batch_size = 1 in this template" data = tvm.placeholder((N, CI, H, W), name='data') kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel') rc = tvm.reduce_axis((0, CI), name='rc') ry = tvm.reduce_axis((0, KH), name='ry') rx = tvm.reduce_axis((0, KW), name='rx') conv = tvm.compute( (N, CO, H - KH + 1, W - KW + 1), lambda nn, ff, yy, xx: tvm.sum( data[nn, rc, yy + ry, xx + rx] * kernel[ff, rc, ry, rx], axis=[rc, ry, rx]), tag="conv2d_nchw") s = tvm.create_schedule([conv.op]) output = conv OL = s.cache_write(conv, 'local') # create cache stage AA = s.cache_read(data, 'shared', [OL]) WW = s.cache_read(kernel, 'shared', [OL]) AL = s.cache_read(AA, 'local', [OL]) WL = s.cache_read(WW, 'local', [OL]) # tile and bind spatial axes n, f, y, x = s[output].op.axis cfg = autotvm.get_config() cfg.define_split("tile_f", cfg.axis(f), num_outputs=4) cfg.define_split("tile_y", cfg.axis(y), num_outputs=4) cfg.define_split("tile_x", cfg.axis(x), num_outputs=4) bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) kernel_scope = n # this is the scope to attach global config inside this kernel s[output].bind(bf, tvm.thread_axis("blockIdx.z")) s[output].bind(by, tvm.thread_axis("blockIdx.y")) s[output].bind(bx, tvm.thread_axis("blockIdx.x")) s[output].bind(vf, tvm.thread_axis("vthread")) s[output].bind(vy, tvm.thread_axis("vthread")) s[output].bind(vx, tvm.thread_axis("vthread")) s[output].bind(tf, tvm.thread_axis("threadIdx.z")) s[output].bind(ty, tvm.thread_axis("threadIdx.y")) s[output].bind(tx, tvm.thread_axis("threadIdx.x")) s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) s[OL].compute_at(s[output], tx) # tile and bind reduction axes n, f, y, x = s[OL].op.axis rc, ry, rx = s[OL].op.reduce_axis cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3) cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=3) cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=3) rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc) ryo, rym, ryi = cfg['tile_rx'].apply(s, OL, ry) rxo, rxm, rxi = cfg['tile_ry'].apply(s, OL, rx) s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x) s[AA].compute_at(s[OL], rxo) s[WW].compute_at(s[OL], rxo) s[AL].compute_at(s[OL], rxm) s[WL].compute_at(s[OL], rxm) # cooperative fetching for load in [AA, WW]: n, f, y, x = s[load].op.axis fused = s[load].fuse(n, f, y, x) tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) s[load].bind(tz, tvm.thread_axis("threadIdx.z")) s[load].bind(ty, tvm.thread_axis("threadIdx.y")) s[load].bind(tx, tvm.thread_axis("threadIdx.x")) # tune unroll cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) cfg.define_knob("unroll_explicit", [0, 1]) s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) return s, [data, kernel, conv]
def simple_template(a, b): cfg = autotvm.get_config() assert cfg.is_fallback
def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding): assert N == 1, "Only consider batch_size = 1 in this template" data = tvm.placeholder((N, CI, H, W), name='data') kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel') conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype='float32') s = tvm.create_schedule([conv.op]) ##### space definition begin ##### n, f, y, x = s[conv].op.axis rc, ry, rx = s[conv].op.reduce_axis cfg = autotvm.get_config() cfg.define_split("tile_f", f, num_outputs=4) cfg.define_split("tile_y", y, num_outputs=4) cfg.define_split("tile_x", x, num_outputs=4) cfg.define_split("tile_rc", rc, num_outputs=3) cfg.define_split("tile_ry", ry, num_outputs=3) cfg.define_split("tile_rx", rx, num_outputs=3) cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) cfg.define_knob("unroll_explicit", [0, 1]) ##### space definition end ##### # inline padding pad_data = s[conv].op.input_tensors[0] s[pad_data].compute_inline() data, raw_data = pad_data, data output = conv OL = s.cache_write(conv, 'local') # create cache stage AA = s.cache_read(data, 'shared', [OL]) WW = s.cache_read(kernel, 'shared', [OL]) AL = s.cache_read(AA, 'local', [OL]) WL = s.cache_read(WW, 'local', [OL]) # tile and bind spatial axes n, f, y, x = s[output].op.axis bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) kernel_scope = n # this is the scope to attach global config inside this kernel s[output].bind(bf, tvm.thread_axis("blockIdx.z")) s[output].bind(by, tvm.thread_axis("blockIdx.y")) s[output].bind(bx, tvm.thread_axis("blockIdx.x")) s[output].bind(vf, tvm.thread_axis("vthread")) s[output].bind(vy, tvm.thread_axis("vthread")) s[output].bind(vx, tvm.thread_axis("vthread")) s[output].bind(tf, tvm.thread_axis("threadIdx.z")) s[output].bind(ty, tvm.thread_axis("threadIdx.y")) s[output].bind(tx, tvm.thread_axis("threadIdx.x")) s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) s[OL].compute_at(s[output], tx) # tile reduction axes n, f, y, x = s[OL].op.axis rc, ry, rx = s[OL].op.reduce_axis rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc) ryo, rym, ryi = cfg['tile_rx'].apply(s, OL, ry) rxo, rxm, rxi = cfg['tile_ry'].apply(s, OL, rx) s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x) s[AA].compute_at(s[OL], rxo) s[WW].compute_at(s[OL], rxo) s[AL].compute_at(s[OL], rxm) s[WL].compute_at(s[OL], rxm) # cooperative fetching for load in [AA, WW]: n, f, y, x = s[load].op.axis fused = s[load].fuse(n, f, y, x) tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) s[load].bind(tz, tvm.thread_axis("threadIdx.z")) s[load].bind(ty, tvm.thread_axis("threadIdx.y")) s[load].bind(tx, tvm.thread_axis("threadIdx.x")) # tune unroll s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) return s, [raw_data, kernel, conv]