def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name, dtype="float32"):
    """Hint: output shape should be (1,)"""
    h, w = shape
    x = tvm.placeholder(shape, dtype, name="x")
    k1 = tvm.reduce_axis((0, w), name="k1")
    x2 = tvm.compute((h, ), lambda i: tvm.max(x[i, k1], axis=k1))
    x3 = tvm.compute(shape, lambda i, j: x[i, j] - x2[i])
    x4 = tvm.compute(shape, lambda i, j: tvm.exp(x3[i, j]))
    k2 = tvm.reduce_axis((0, w), name="k2")
    x5 = tvm.compute((h, ), lambda i: tvm.sum(x4[i, k2], axis=k2))
    log_p = tvm.compute(shape, lambda i, j: tvm.log(x4[i, j] / x5[i]))
    q = tvm.placeholder(shape, dtype, name="q")
    k3 = tvm.reduce_axis((0, w), name="k3")
    y1 = tvm.compute((h, ), lambda i: tvm.sum(q[i, k3] * log_p[i, k3], axis=k3))
    k4 = tvm.reduce_axis((0, h), name="k4")
    y2 = tvm.compute((1, ), lambda i: tvm.sum(-y1[k4], axis=k4))
    y3 = tvm.compute((1, ), lambda i: y2[i] / h)
    s = tvm.create_schedule(y3.op)
    f = tvm.build(s, [x, q, y3], tgt, target_host=tgt_host, name=func_name)
    return f
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name, dtype="float32"):
    """Hint: output shape should be (1,)"""
    assert len(shape) == 2
    x = tvm.placeholder(shape, dtype=dtype, name='x')
    y = tvm.placeholder(shape, dtype=dtype, name='y')
    n, cls_num = shape
    max_iter = tvm.reduce_axis((0, cls_num), name='max_iter')
    x_max = tvm.compute((n,), lambda i: tvm.max(x[i, max_iter], axis=max_iter))
    e_x = tvm.compute(shape, lambda i, j: tvm.exp(x[i, j] - x_max[i]))
    sum_ex_iter = tvm.reduce_axis((0, cls_num), name='sum_ex_iter')
    e_x_sum = tvm.compute((n,), lambda i: tvm.sum(e_x[i, sum_ex_iter], axis=sum_ex_iter))
    log_softmax = tvm.compute(shape, lambda i, j: tvm.log(e_x[i, j] / e_x_sum[i]))
    y_mul_log_softmax = tvm.compute(shape, lambda i, j: log_softmax[i, j] * y[i, j])
    sum_entropy_iter = tvm.reduce_axis((0, cls_num), name='sum_entropy_iter')
    mean_iter = tvm.reduce_axis((0, n), name='mean_iter')
    sum_entropy = tvm.compute(
        (1,),
        lambda i: tvm.sum(y_mul_log_softmax[mean_iter, sum_entropy_iter],
                          axis=[mean_iter, sum_entropy_iter]))
    scale = tvm.const(-n, dtype)
    entropy = tvm.compute((1,), lambda i: sum_entropy[i] / scale)
    s = tvm.create_schedule(entropy.op)
    f = tvm.build(s, [x, y, entropy], tgt, target_host=tgt_host, name=func_name)
    return f
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name, dtype="float32"):
    """Hint: output shape should be (1,)"""
    X = tvm.placeholder(shape, dtype=dtype, name="X")
    _, _, _, SOFTMAX = softmax_computation(X, shape, dtype)
    # cross_entropy = np.mean(
    #     -np.sum(y_ * np.log(autodiff.softmax_func(y)), axis=1), keepdims=True)
    Y = tvm.placeholder(shape, dtype=dtype, name="Y")
    MUL_LOG = tvm.compute(shape, lambda *i: Y(*i) * tvm.log(SOFTMAX(*i)), name="MUL_LOG")
    k1 = tvm.reduce_axis((0, shape[1]), name="k1")
    SUM1 = tvm.compute((shape[0],), lambda i: tvm.sum(-MUL_LOG(i, k1), axis=k1), name="SUM1")
    k2 = tvm.reduce_axis((0, shape[0]), name="k2")
    SUM2 = tvm.compute((1,), lambda _: tvm.sum(SUM1(k2), axis=k2), name="SUM2")
    OUT = tvm.compute((1,), lambda i: SUM2(i) / shape[0])
    # schedule
    s = tvm.create_schedule(OUT.op)
    # compile
    f = tvm.build(s, [X, Y, OUT], tgt, target_host=tgt_host, name=func_name)
    return f
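# The implementations above all build a batched softmax cross-entropy kernel whose output
# shape is (1,). A minimal sketch of how such a built function might be invoked and checked
# against a NumPy reference; it assumes the legacy TVM 0.x API used throughout this listing
# and an available "llvm" target, and the shape, function name, and data below are
# illustrative only:
import numpy as np
import tvm

shape = (4, 10)
f = make_matrix_softmax_cross_entropy(shape, "llvm", "llvm", "softmax_xent")

x_np = np.random.uniform(size=shape).astype("float32")
y_np = np.zeros(shape, dtype="float32")
y_np[np.arange(shape[0]), np.random.randint(0, shape[1], shape[0])] = 1.0  # one-hot labels

ctx = tvm.cpu(0)
x_nd = tvm.nd.array(x_np, ctx)
y_nd = tvm.nd.array(y_np, ctx)
out_nd = tvm.nd.array(np.zeros((1,), dtype="float32"), ctx)
f(x_nd, y_nd, out_nd)

# NumPy reference: mean over the batch of -sum(y * log_softmax(x), axis=1)
x_shift = x_np - x_np.max(axis=1, keepdims=True)
log_sm = x_shift - np.log(np.exp(x_shift).sum(axis=1, keepdims=True))
ref = np.mean(-np.sum(y_np * log_sm, axis=1), keepdims=True)
np.testing.assert_allclose(out_nd.asnumpy(), ref, rtol=1e-5)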
def test_log_pow_llvm():
    # graph
    n = tvm.var('n')
    A = tvm.placeholder((n, ), name='A')
    B = tvm.compute(A.shape, lambda *i: tvm.power(tvm.log(A(*i)), 2.0), name='B')
    s = tvm.create_schedule(B.op)
    # create iter var and assign them tags.
    bx, tx = s[B].split(B.op.axis[0], factor=32)
    # one line to build the function.
    if not tvm.module.enabled("llvm"):
        return
    flog = tvm.build(s, [A, B], "llvm", name="mylog")
    ctx = tvm.cpu(0)
    # launch the kernel.
    n = 1028
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
    b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
    repeat = 10
    ftimer = flog.time_evaluator(flog.entry_name, ctx, number=1, repeat=repeat)
    res = ftimer(a, b)
    assert len(res.results) == repeat
    np.testing.assert_allclose(
        b.asnumpy(), np.power(np.log(a.asnumpy()), 2.0), rtol=1e-5)
def test_1():
    m = 64
    n = 128
    shape = (64, 128)
    A = tvm.placeholder(shape, name="A")
    C = tvm.compute(shape, lambda *indice: tvm.log(A(*indice)), name="C")
    s = tvm.create_schedule(C.op)
    print(tvm.lower(s, [A, C], simple_mode=True))
def log(x):
    """Take logarithm of input x.

    Parameters
    ----------
    x : tvm.Tensor
        Input argument.

    Returns
    -------
    y : tvm.Tensor
        The result.
    """
    return tvm.compute(x.shape, lambda *i: tvm.log(x(*i)))
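# A small sketch of how the element-wise log op above might be scheduled and executed;
# it assumes the legacy TVM 0.x API and an "llvm" target, and the (8,) shape is illustrative:
import numpy as np
import tvm

x = tvm.placeholder((8,), name="x", dtype="float32")
y = log(x)  # the tvm.compute node defined above
s = tvm.create_schedule(y.op)
fn = tvm.build(s, [x, y], "llvm", name="elemwise_log")

ctx = tvm.cpu(0)
x_nd = tvm.nd.array(np.random.uniform(1.0, 2.0, size=8).astype("float32"), ctx)
y_nd = tvm.nd.array(np.zeros(8, dtype="float32"), ctx)
fn(x_nd, y_nd)
np.testing.assert_allclose(y_nd.asnumpy(), np.log(x_nd.asnumpy()), rtol=1e-5)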
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name, dtype="float32"):
    X = tvm.placeholder(shape, dtype=dtype, name="X")
    Y_orig = tvm.placeholder(shape, dtype=dtype, name="Y_orig")
    j1 = tvm.reduce_axis((0, shape[1]), "j1")
    j2 = tvm.reduce_axis((0, shape[1]), "j2")
    maxX = tvm.compute((shape[0], ), lambda i: tvm.max(X[i, j1], axis=j1), name="maxX")
    numerator = tvm.compute(shape, lambda i, j: tvm.exp(X[i, j] - maxX[i]), name="numerator")
    denominator = tvm.compute((shape[0], ), lambda i: tvm.sum(numerator[i, j2], axis=j2),
                              name="denominator")
    m1 = tvm.reduce_axis((0, shape[0]), "m1")
    m2 = tvm.reduce_axis((0, shape[1]), "m2")
    cross_entropy_sum = tvm.compute(
        (1, ),
        lambda i: tvm.sum(Y_orig[m1, m2] * tvm.log(numerator[m1, m2] / denominator[m1]),
                          axis=[m1, m2]),
        name="cross_entropy_sum")
    negated = tvm.compute((1, ), lambda i: -cross_entropy_sum[i] / shape[0], name="negated")
    s = tvm.create_schedule(negated.op)
    # print(tvm.lower(s, [X, Y_orig, negated], simple_mode=True))
    return tvm.build(s, [X, Y_orig, negated], tgt, target_host=tgt_host, name=func_name)
def algorithm_forward(self):
    assert self.x.size == self.t.size, \
        "only supports one-hot vector"
    self.softmax, self.max_elem, self.expsum = Softmax.forward_(
        self.x, name=f'{self.label}')
    if self.x.ndim == 1:
        y = topi.reshape(self.softmax, (1, self.x.size))
        t = topi.reshape(self.t.tvm_tensor, (1, self.t.size))
        m = 1
        n = self.x.shape[0]
    elif self.x.ndim == 2:
        y = self.softmax
        t = self.t.tvm_tensor
        m, n = self.x.shape
    else:
        raise NotImplementedError
    self.ty = tvm.compute((m, n), lambda i, j: tvm.log(y[i, j]) * t[i, j],
                          name=f'{self.label}:ty')
    k = tvm.reduce_axis((0, n), name='k')
    self.sum_ty = tvm.compute((m, ), lambda i: tvm.sum(self.ty[i, k], axis=k),
                              name=f'{self.label}:sum_ty')
    # TODO: need to validate the shape and keepdims
    # self.shape would be like (1, 1, 1), whose size is 1
    expected_size = 1
    assert self.size == expected_size, \
        f'size of SoftmaxWithCrossEntropyLoss must be {expected_size}, not {self.size}'
    k = tvm.reduce_axis((0, m), name='k')
    self.total = tvm.compute(self.shape, lambda *idxs: tvm.sum(self.sum_ty[k], axis=k),
                             name=f'{self.label}:total')
    self.tvm_tensor = tvm.compute(self.shape, lambda *idxs: -self.total[idxs] / m,
                                  name=f'{self.label}:tensor')
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name, dtype="float32"):
    """Hint: output shape should be (1,)"""
    A = tvm.placeholder(shape, dtype=dtype, name="A")
    B = tvm.placeholder(shape, dtype=dtype, name="B")
    k = tvm.reduce_axis((0, shape[1]), name="k")
    max_A = tvm.compute((shape[0], ), lambda i: tvm.max(A[i, k], axis=k), name="max_A")
    exp = tvm.compute(shape, lambda i, j: tvm.exp(A[i, j] - max_A[i]), name="exp")
    k1 = tvm.reduce_axis((0, shape[1]), name="k1")
    sum_exp = tvm.compute((shape[0], ), lambda i: tvm.sum(exp[i, k1], axis=k1), name="sum_exp")
    softmax = tvm.compute(shape, lambda i, j: exp[i, j] / sum_exp[i], name="softmax")
    log = tvm.compute(shape, lambda i, j: tvm.log(softmax[i, j]), name="log")
    k2 = tvm.reduce_axis((0, shape[1]), name="k2")
    sum_softmax = tvm.compute(
        (shape[0], ), lambda i: tvm.sum(B[i, k2] * log[i, k2], axis=k2), name="sum_softmax")
    k3 = tvm.reduce_axis((0, shape[0]), name="k3")
    softmax_cross_entropy = tvm.compute(
        (1, ), lambda i: tvm.sum(-1 * sum_softmax[k3] / shape[0], axis=k3))
    s = tvm.create_schedule(softmax_cross_entropy.op)
    f = tvm.build(s, [A, B, softmax_cross_entropy], tgt, target_host=tgt_host, name=func_name)
    return f
def log_softmax(x):
    """Perform log softmax activation on the data

    Parameters
    ----------
    data : tvm.Tensor
        2-D input data

    Returns
    -------
    output : tvm.Tensor
        2-D output with same shape
    """
    assert len(x.shape) == 2, "only support 2-dim log softmax"
    m, n = x.shape
    k = tvm.reduce_axis((0, n), name='k')
    max_elem = tvm.compute((m, ), lambda i: tvm.max(x[i, k], axis=k))
    k = tvm.reduce_axis((0, n), name='k')
    expsum = tvm.compute(
        (m, ), lambda i: tvm.sum(tvm.exp(x[i, k] - max_elem[i]), axis=k))
    return tvm.compute(
        x.shape, lambda i, j: x[i, j] - max_elem[i] - tvm.log(expsum[i]))
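# A hedged sketch of checking the log_softmax op above against a NumPy reference;
# legacy TVM 0.x API assumed, and the shape and "llvm" target are illustrative:
import numpy as np
import tvm

m, n = 4, 16
x = tvm.placeholder((m, n), name="x", dtype="float32")
y = log_softmax(x)
s = tvm.create_schedule(y.op)
fn = tvm.build(s, [x, y], "llvm", name="log_softmax")

ctx = tvm.cpu(0)
x_np = np.random.uniform(size=(m, n)).astype("float32")
y_nd = tvm.nd.array(np.zeros((m, n), dtype="float32"), ctx)
fn(tvm.nd.array(x_np, ctx), y_nd)

# reference: x - max - log(sum(exp(x - max)))
shifted = x_np - x_np.max(axis=1, keepdims=True)
ref = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
np.testing.assert_allclose(y_nd.asnumpy(), ref, rtol=1e-5)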
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name, dtype="float32"):
    assert len(shape) == 2, "only 2d tensor is accepted for batched softmax xent loss"
    num_batch, num_class = shape
    logits = tvm.placeholder(shape, dtype, "logits")
    truth = tvm.placeholder(shape, dtype, "truth")
    k = tvm.reduce_axis((0, num_class), name="k")
    max_logits = tvm.compute((num_batch,), lambda i: tvm.max(logits[i, k], axis=k))
    logits_shifted = tvm.compute(shape, lambda i, j: logits[i, j] - max_logits[i])
    exps = tvm.compute(shape, lambda *i: tvm.exp(logits_shifted(*i)))
    k = tvm.reduce_axis((0, num_class), name="k")
    exps_sum = tvm.compute((num_batch,), lambda i: tvm.sum(exps[i, k], axis=k))
    neg_pred_log = tvm.compute(shape, lambda i, j: tvm.log(exps_sum[i]) - logits_shifted[i, j])
    ewise_prod = tvm.compute(shape, lambda *i: truth(*i) * neg_pred_log(*i))
    i = tvm.reduce_axis((0, num_batch), name="i")
    j = tvm.reduce_axis((0, num_class), name="j")
    ce_sum = tvm.compute((1,), lambda _: tvm.sum(ewise_prod[i, j], axis=[i, j]))
    ce_mean = tvm.compute((1,), lambda _: ce_sum[0] / tvm.const(num_batch, dtype))
    s = tvm.create_schedule(ce_mean.op)
    f = tvm.build(s, [logits, truth, ce_mean], tgt, tgt_host, func_name)
    return f
def test():
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (2, 16)
    a_host = tvm.placeholder(shape, env.cfg['dtype_n'], 'a_host')
    print('a host ' + str(a_host))
    a = tvm.compute(shape, lambda *i: a_host(*i), name='a')
    a_buf = tvm.compute(shape, lambda *i: a(*i), name='a_buf')
    b_buf = tvm.compute(
        shape, lambda i, j: tvm.log(a_buf[i, j].astype(env.cfg['dtype_w'])), name='b_buf')
    b = tvm.compute(shape, lambda *i: b_buf(*i), name='b')
    b_host = tvm.compute(shape, lambda *i: b(*i), name='b_host')
    s = tvm.create_schedule(b_host.op)

    # mark variable scopes
    s[a].set_scope(env.dram_scope)
    s[b].set_scope(env.dram_scope)
    s[a_buf].set_scope(env.uni_scratchpad_scope)
    s[b_buf].set_scope(env.uni_scratchpad_scope)
    # print(dir(s[b].op.body))

    # mark compiler pragmas
    s[a].pragma(s[a].op.axis[0], env.dma_copy_pragma)
    s[b_host].pragma(s[b_host].op.axis[0], env.dma_copy_pragma)
    s[a_buf].pragma(s[a_buf].op.axis[0], env.scratchpad_ls)
    s[b].pragma(s[b].op.axis[0], env.scratchpad_ls)
    s[a_buf].compute_at(s[b_buf], b_buf.op.axis[0])

    # tensorize
    s[b_buf].tensorize(s[b_buf].op.axis[1], env.intrins.get('VLOG', mode='inc'))

    # build
    print(tvm.lower(s, [a_host, b_host], simple_mode=True))
    print(nnpu.lower(s, [a_host, b_host], simple_mode=True))
    # exit()
    func = nnpu.build(s, [a_host, b_host], 'nnpu', 'llvm', name='nnpu_log')
    print('function built: ')
    # print(func.get_source())

    # prepare data
    ctx = tvm.nd.TVMContext(13, 0)  # ???
    print('i want to know:')
    print(ctx.exist)
    a_np = np.random.randint(size=shape, dtype=a_host.dtype, low=1, high=20)
    a_nd = tvm.nd.array(a_np, ctx)
    b_nd = tvm.nd.array(np.zeros(shape).astype(b_host.dtype), ctx)

    # run
    func(a_nd, b_nd)
    print('run finished')
    b_np = b_nd.asnumpy()
    print('a=')
    print(a_np)
    print('b=')
    print(b_np)
    print('ground truth =')
    gt = np.log(a_np, dtype=b_host.dtype)
    print(gt)
    np.testing.assert_allclose(b_np, gt)
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name, dtype="float32"):
    """Hint: output shape should be (1,)"""
    X = tvm.placeholder(shape, dtype=dtype, name='X')
    a1 = tvm.reduce_axis((0, shape[1]), name='a1')
    MAX_X = tvm.compute((shape[0], ), lambda i: tvm.max(X[i, a1], axis=[a1]))
    E_X = tvm.compute(shape, lambda i, j: tvm.exp(X[i, j] - MAX_X(i)))
    a2 = tvm.reduce_axis((0, shape[1]), name='a2')
    E_X_SUM = tvm.compute((shape[0], ), lambda i: tvm.sum(E_X[i, a2], axis=[a2]))
    SOFTMAX_X = tvm.compute(shape, lambda i, j: E_X[i, j] / E_X_SUM(i))
    LOG_SOFTMAX_X = tvm.compute(shape, lambda i, j: tvm.log(SOFTMAX_X[i, j]))
    X_P = tvm.placeholder(shape, dtype=dtype, name='X_P')
    MUL = tvm.compute(shape, lambda i, j: X_P[i, j] * LOG_SOFTMAX_X[i, j])
    a3 = tvm.reduce_axis((0, shape[1]), name='a3')
    SUM = tvm.compute((shape[0], ), lambda i: tvm.sum(-MUL[i, a3], axis=[a3]))
    a4 = tvm.reduce_axis((0, shape[0]), name='a4')
    MEAN = tvm.compute((1, ), lambda i: tvm.sum(SUM[a4] / shape[0], axis=[a4]))

    # s = tvm.create_schedule([MAX_X.op, E_X.op, E_X_SUM.op, SOFTMAX_X.op, LOG_SOFTMAX_X.op, MUL.op, SUM.op, MEAN.op])
    s = tvm.create_schedule(MEAN.op)
    # print(tvm.lower(s, [X, X_P, MEAN], simple_mode=True))

    # MAX_X
    s[MAX_X].bind(MAX_X.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[MAX_X].bind(a1, tvm.thread_axis("threadIdx.x"))
    # E_X
    s[E_X].bind(E_X.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[E_X].bind(E_X.op.axis[1], tvm.thread_axis("threadIdx.x"))
    # E_X_SUM
    s[E_X_SUM].bind(E_X_SUM.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[E_X_SUM].bind(a2, tvm.thread_axis("threadIdx.x"))
    # SOFTMAX_X
    s[SOFTMAX_X].bind(SOFTMAX_X.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[SOFTMAX_X].bind(SOFTMAX_X.op.axis[1], tvm.thread_axis("threadIdx.x"))
    # LOG_SOFTMAX_X
    s[LOG_SOFTMAX_X].bind(LOG_SOFTMAX_X.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[LOG_SOFTMAX_X].bind(LOG_SOFTMAX_X.op.axis[1], tvm.thread_axis("threadIdx.x"))
    # MUL
    s[MUL].bind(MUL.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[MUL].bind(MUL.op.axis[1], tvm.thread_axis("threadIdx.x"))
    # SUM
    s[SUM].bind(SUM.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[SUM].bind(a3, tvm.thread_axis("threadIdx.x"))
    # MEAN
    # s[MEAN].bind(a4, tvm.thread_axis("blockIdx.x"))
    s[MEAN].bind(a4, tvm.thread_axis("threadIdx.x"))

    # print(tvm.lower(s, [X, X_P, MEAN], simple_mode=True))
    # block_x = tvm.thread_axis("blockIdx.x")
    # thread_x = tvm.thread_axis("threadIdx.x")
    # zo, zi = s[SUM].split(SUM.op.axis[0], 3)
    # print(tvm.lower(s, [X, X_P, MEAN], simple_mode=True))
    # s[SUM].bind(zo, block_x)
    # s[SUM].bind(zi, thread_x)
    f = tvm.build(s, [X, X_P, MEAN], tgt, target_host=tgt_host, name=func_name)
    return _export_module(f, func_name, remote)
                    default='S0', choices=['S0', 'S1', 'SC'])
args = parser.parse_args()

with ScheduleProcHelper(), nnpu.Environment('./nnpu_config_fp32.yaml'):
    env = nnpu.get_env()
    nnpu.set_device(env, type=args.sim)

    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    assert dtype_w in ['float32', 'float16'], \
        'a float dtype is needed when testing activation functions'

    shape = (64, )
    a = tvm.placeholder(shape, dtype_w, 'a')
    a_buf = tvm.compute(shape, lambda *i: a(*i), 'a_buf')
    exp = tvm.compute(shape, lambda i: tvm.exp(a_buf[i]), 'exp')
    log = tvm.compute(shape, lambda i: tvm.log(a_buf[i]), 'log')
    tanh = tvm.compute(shape, lambda i: tvm.tanh(a_buf[i]), 'tanh')
    sigmoid = tvm.compute(shape, lambda i: tvm.sigmoid(a_buf[i]), 'sigmoid')

    # k = tvm.reduce_axis((0, 16), 'k0')
    # sum = tvm.compute((1, ), lambda i: tvm.sum(sigmoid[k], axis=k), 'sum')
    # nnpu.utils.MarkScope(sum)
    # softmax = tvm.compute(shape, lambda i: sigmoid[i] / sum[0], 'softmax')
    # nnpu.utils.MarkScope(softmax)
    # softmax_host, _ = nnpu.utils.CopyBufToH(softmax, 'softmax')

    s = nnpu.create_schedule([exp.op, log.op, tanh.op, sigmoid.op])
    # cache write
    exp_buf = s.cache_write(exp, env.get_scope('buffer0'))
    log_buf = s.cache_write(log, env.get_scope('buffer0'))
def logsummul(dtype):
    nn = 512
    bb = 32
    n = tvm.var('n')
    n = tvm.convert(nn)
    b = tvm.var('b')
    b = tvm.convert(bb)
    m, l = n, n
    A = tvm.placeholder((b, n, l), name='A', dtype=dtype)
    B = tvm.placeholder((b, m, l), name='B', dtype=dtype)
    k = tvm.reduce_axis((0, l), name='k')
    k2 = tvm.reduce_axis((0, l), name='k2')

    M = tvm.compute(
        (b, m, n),
        lambda bb, ii, jj: tvm.max(A[bb, jj, k] + B[bb, ii, k], axis=k),
        name='M')
    M2 = tvm.compute(
        (b, m, n),
        lambda bb, ii, jj: tvm.sum(
            tvm.exp(A[bb, jj, k2] + B[bb, ii, k2] - M[bb, ii, jj]), axis=k2),
        name='M2')
    C = tvm.compute(
        (b, m, n),
        lambda bb, ii, jj: tvm.log(M2[bb, ii, jj]) + M[bb, ii, jj],
        name='C')

    s = tvm.create_schedule(C.op)
    AA = s.cache_read(A, "shared", [M])
    AL = s.cache_read(AA, "local", [M])
    BB = s.cache_read(B, "shared", [M])
    BL = s.cache_read(BB, "local", [M])
    AA2 = s.cache_read(A, "shared", [M2])
    AL2 = s.cache_read(AA2, "local", [M2])
    BB2 = s.cache_read(B, "shared", [M2])
    BL2 = s.cache_read(BB2, "local", [M2])

    cfg = autotvm.get_config()
    cfg.define_knob("y_bn", [32, 64, 128])
    cfg.define_knob("x_bn", [32, 64, 128])
    cfg.define_knob("y_t", [8, 32, 64])
    cfg.define_knob("x_t", [2, 4, 8, 32])
    cfg.define_knob("k_split", [1, 2, 8, 16])
    unroll = True
    # cfg.define_knob("y_bn", [64])
    # cfg.define_knob("x_bn", [64])
    # cfg.define_knob("y_t", [8])
    # cfg.define_knob("x_t", [8])
    # cfg.define_knob("k_split", [8])

    b, y, x = s[C].op.axis
    y_bn = cfg["y_bn"].val
    x_bn = cfg["x_bn"].val
    by, y = s[C].split(y, y_bn)
    bx, x = s[C].split(x, x_bn)

    y_nthreads = cfg["y_t"].val
    x_nthreads = cfg["x_t"].val
    ty, yi = s[C].split(y, nparts=y_nthreads)
    tx, xi = s[C].split(x, nparts=x_nthreads)
    thread_x = tvm.thread_axis((0, x_nthreads), "threadIdx.x")
    thread_y = tvm.thread_axis((0, y_nthreads), "threadIdx.y")

    s[C].reorder(b, by, bx, ty, tx, yi, xi)
    s[C].bind(b, tvm.thread_axis("blockIdx.z"))
    s[C].bind(by, tvm.thread_axis("blockIdx.y"))
    s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    if unroll:
        s[C].pragma(yi, "auto_unroll_max_step", 16)

    def cache_split(shared):
        s[shared].compute_at(s[C], tx)
        _, yi, xi = s[shared].op.axis
        k, = s[shared].op.reduce_axis
        ko, ki = s[shared].split(k, cfg["k_split"].val)
        s[shared].reorder(ko, ki, yi, xi)
        if unroll:
            s[shared].pragma(ki, "auto_unroll_max_step", 16)
        return ko, ki

    ko, ki = cache_split(M)
    ko2, ki2 = cache_split(M2)

    def cache_read(shared, AA, AL, BB, BL, ko, ki):
        s[AA].compute_at(s[shared], ko)
        s[AL].compute_at(s[shared], ki)
        s[BB].compute_at(s[shared], ko)
        s[BL].compute_at(s[shared], ki)

        _, y, k = s[AA].op.axis
        ty, yi = s[AA].split(y, nparts=y_nthreads)
        tx, ki = s[AA].split(k, nparts=x_nthreads)
        s[AA].reorder(ty, tx, yi, ki)
        s[AA].bind(ty, thread_y)
        s[AA].bind(tx, thread_x)
        if unroll:
            s[AA].pragma(yi, "auto_unroll_max_step", 16)

        _, x, k = s[BB].op.axis
        ty, xi = s[BB].split(x, nparts=y_nthreads)
        tx, ki = s[BB].split(k, nparts=x_nthreads)
        s[BB].bind(ty, thread_y)
        s[BB].bind(tx, thread_x)
        s[BB].reorder(ty, tx, xi, ki)
        if unroll:
            s[BB].pragma(xi, "auto_unroll_max_step", 16)

    cache_read(M, AA, AL, BB, BL, ko, ki)
    cache_read(M2, AA2, AL2, BB2, BL2, ko2, ki2)

    return s, [A, B, C]
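# For reference, the C produced by logsummul above is a batched log-sum-exp "matmul":
# C[b, i, j] = log(sum_k exp(A[b, j, k] + B[b, i, k])), computed with the row-wise max
# subtracted and added back for numerical stability. A NumPy sketch of the same quantity
# (the helper name and sizes are illustrative; note the transposed roles of i and j in A):
import numpy as np

def logsummul_ref(A, B):
    # A: (batch, n, l), B: (batch, m, l) -> C: (batch, m, n)
    S = A[:, None, :, :] + B[:, :, None, :]   # S[b, i, j, k] = A[b, j, k] + B[b, i, k]
    M = S.max(axis=-1)                         # stabilizing max over k
    return np.log(np.exp(S - M[..., None]).sum(axis=-1)) + M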