def test_reduce_combiner_simplify(): ck = CanonicalChecker() dummy = tvm.var('dummy') comm_reducer = tvm.comm_reducer prod = comm_reducer(lambda x, y: x*y, lambda t0: tvm.const(1, t0)) sum_or_prod = comm_reducer( lambda x, y: tvm.expr.Select(dummy < 0, x + y, x*y), lambda t0: tvm.expr.Select(dummy < 0, tvm.const(0, t0), tvm.const(1, t0))) sum_and_prod = comm_reducer( lambda x, y: (x[0] + y[0], x[1]*y[1]), lambda t0, t1: (tvm.const(0, t0), tvm.const(5, t0) - tvm.const(4, t0))) some_reducer1 = comm_reducer( lambda x, y: (x[0] + y[0], x[0] + y[0] + x[1] + y[1], x[0]*y[2] + y[0]*x[2], x[1] + y[2], 4.0), lambda t0, t1, t2, t3, t4: (tvm.const(0, t0), tvm.const(1, t1), tvm.const(2, t2), tvm.const(3, t3), tvm.const(4, t4))) k = tvm.reduce_axis((0, 10), name="k") A = tvm.placeholder((10,), name='A') # Test that SimplifyCombiner makes use of vranges ck.analyzer.update(dummy, tvm.arith.ConstIntBound(-10, -4)) ck.verify(sum_or_prod(A[k], k), tvm.sum(A[k], k)) ck.analyzer.update(dummy, tvm.arith.ConstIntBound(5, 9), True) ck.verify(sum_or_prod(A[k], k), prod(A[k], k)) ck.analyzer.update(dummy, tvm.arith.ConstIntBound(-10, 100), True) ck.verify(sum_and_prod((A[k], A[10-k]), k)[0], tvm.sum(A[k], k)) ck.verify(sum_and_prod((A[k], A[10-k]), k)[1], prod(A[10-k], k)) reference_simplified_sources = [[A[0]], [A[0], A[1]], [A[0], A[2]], [A[0], A[1], A[2], A[3]], [A[4]]] for j in range(5): # Here we use the j-th component of the result, so only it and the components it # depends on are left. simplified = ck.analyzer.canonical_simplify( some_reducer1((A[0], A[1], A[2], A[3], A[4]), k)[j]) # Check that the remaining components are the expected ones. for lhs, rhs in zip(simplified.source, reference_simplified_sources[j]): assert tvm.ir_pass.Equal(lhs, rhs) # Test that components with side effects are not removed side_effect = lambda *xs: tvm.make.Call("int32", "dummy", xs, tvm.expr.Call.Intrinsic, None, 0) ck.verify(sum_and_prod((A[k], side_effect(A[10-k])), k)[0], sum_and_prod((A[k], side_effect(A[10-k])), k)[0]) ck.verify(sum_and_prod((side_effect(A[k]), A[10-k]), k)[0], tvm.sum(side_effect(A[k]), k))
def _declaration_dense_nopack(cfg, data, weight, bias=None, out_dtype=None): if out_dtype is None: out_dtype = data.dtype batch, in_dim = get_const_tuple(data.shape) out_dim, _ = get_const_tuple(weight.shape) # create tuning space cfg.define_split("tile_x", out_dim, num_outputs=2) cfg.define_split("tile_y", batch, num_outputs=2) cfg.define_split("tile_k", in_dim, num_outputs=2) if cfg.is_fallback: _default_dense_nopack_config(cfg, batch, out_dim, in_dim) vec = cfg["tile_k"].size[-1] k = tvm.reduce_axis((0, in_dim // vec), "k") CC = tvm.compute((batch, out_dim, vec), lambda z, y, x: tvm.sum( data[z, k * vec + x].astype(out_dtype) * weight[y, k * vec + x].astype(out_dtype), axis=k)) kk = tvm.reduce_axis((0, vec), "kk") C = tvm.compute((batch, out_dim), lambda y, x: tvm.sum(CC[y, x, kk], axis=kk), tag="dense_nopack") if bias is not None: C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST) return C
def test_lstm_cell_inline(): num_step = 128 num_input = 256 num_hidden = 1152 batch_size = 4 # Global transition matrix X = tvm.placeholder((num_step - 1, batch_size, num_input), name="X") Wi2h = tvm.placeholder((4, num_hidden, num_input), name="Wi2h") Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h") # h: output hidden state, c: cell state. s_state_h = tvm.placeholder((num_step, batch_size, num_hidden)) s_state_c = tvm.placeholder((num_step, batch_size, num_hidden)) s_init_c = tvm.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_c") s_init_h = tvm.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_h") # LSTM transition k = tvm.reduce_axis((0, num_input), name="ki2h") s_i2h = tvm.compute( (num_step, 4, batch_size, num_hidden), lambda t, x, i, j: tvm.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k), name="s_i2h") k = tvm.reduce_axis((0, num_hidden), name="ki2h") s_h2h = tvm.compute( (num_step, 4, batch_size, num_hidden), lambda t, x, i, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k), name="s_h2h") # Gate rules gates = tvm.compute(s_i2h.shape, lambda *i: s_i2h(*i) + s_h2h(*i), name="gates") gshape = (num_step, batch_size, num_hidden) in_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 0, i, j]), name="in_gate") in_transform = tvm.compute(gshape, lambda t, i, j: tvm.tanh(gates[t, 1, i, j]), name="in_transform") forget_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 2, i, j]), name="forget_gate") out_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 3, i, j]), name="out_gate") next_c = tvm.compute(gshape, lambda t, i, j: forget_gate[t, i, j] * s_state_c[t - 1, i, j] + in_gate[t, i, j] * in_transform[t, i, j], name="next_c") next_h = tvm.compute(gshape, lambda t, i, j: out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]), name="next_h") update_c = tvm.compute(gshape, lambda *i: next_c(*i), name="update_c") update_h = tvm.compute(gshape, lambda *i: next_h(*i), name="update_h") # schedule scan_h, scan_c = tvm.scan( [s_init_h, s_init_c], [update_h, update_c], [s_state_h, s_state_c], inputs=[X], name="lstm_scan") # schedule s = tvm.create_schedule(scan_h.op) # Inline gate computations s[gates].compute_inline() s[in_gate].compute_inline() s[in_transform].compute_inline() s[forget_gate].compute_inline() s[out_gate].compute_inline() # verify we can lower correctly tvm.lower(s, [X, Wi2h, Wh2h, scan_h, scan_c])
def test_tensor_reduce_multi_axis(): m = tvm.var('m') n = tvm.var('n') A = tvm.placeholder((m, n), name='A') k1 = tvm.reduce_axis((0, n), "k") k2 = tvm.reduce_axis((0, m), "k") C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=(k1, k2))) C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=[k1, k2]))
def test_reduce_simplify(): ck = CanonicalChecker() k = tvm.reduce_axis((0, 10), name="k") j = tvm.reduce_axis((-5, 3), name="j") A = tvm.placeholder((10,), name='A') ck.verify(tvm.sum(tvm.expr.Select(k + j < 12, k + j, 0), [k, j]), tvm.sum(k + j, [k, j])) ck.verify(tvm.sum(A[3], []), A[3]) # The rule below is not typical, removed for now ck.verify(tvm.sum(k / 10, k), tvm.sum(tvm.const(0, "int32"), k))
def _conv(n, h, w, co, vh, vw, vc): b1b2 = (b1+b2).astype(out_dtype) if dorefa: return tvm.sum( (tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1].astype(out_dtype) & kernel_vec[co, dh, dw, ci, vc, b2].astype(out_dtype)) - tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1].astype(out_dtype) & ~kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype)) << b1b2, axis=[dh, dw, ci, b1, b2]) return tvm.sum(tvm.popcount( data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) << b1b2, axis=[dh, dw, ci, b1, b2])
def _conv(n, co, h, w, vh, vw, vc): b1b2 = (b1+b2).astype(out_dtype) if unipolar: return tvm.sum((tvm.popcount( data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) & kernel_vec[co, ci, dh, dw, b2, vc].astype(out_dtype)) - tvm.popcount( data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) & ~kernel_vec[co, ci, dh, dw, b2, vc]).astype(out_dtype)) << b1b2, axis=[ci, dh, dw, b1, b2]) return tvm.sum((tvm.popcount( data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, axis=[ci, dh, dw, b1, b2])
def _sample(i, c, ph, pw): roi = rois[i] batch_index = roi[0].astype('int32') roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4] roi_start_h *= spatial_scale roi_end_h *= spatial_scale roi_start_w *= spatial_scale roi_end_w *= spatial_scale # force malformed ROIs to be 1x1 roi_h = tvm.max(roi_end_h - roi_start_h, tvm.const(1.0, dtype)) roi_w = tvm.max(roi_end_w - roi_start_w, tvm.const(1.0, dtype)) bin_h = roi_h / pooled_size_h bin_w = roi_w / pooled_size_w if sample_ratio > 0: roi_bin_grid_h = roi_bin_grid_w = tvm.const(sample_ratio, 'int32') else: roi_bin_grid_h = tvm.ceil(roi_h / pooled_size_h).astype('int32') roi_bin_grid_w = tvm.ceil(roi_w / pooled_size_w).astype('int32') count = roi_bin_grid_h * roi_bin_grid_w rh = tvm.reduce_axis((0, roi_bin_grid_h)) rw = tvm.reduce_axis((0, roi_bin_grid_w)) roi_start_h += ph * bin_h roi_start_w += pw * bin_w return tvm.sum(_bilinear(batch_index, c, roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h, roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w) / count, axis=[rh, rw])
def binary_dense(data, weight): """Binary matrix multiplication using xor and bit-count. Parameters ---------- data : tvm.Tensor 2-D with shape [batch, in_dim], dtype is uint32. weight : tvm.Tensor 2-D with shape [out_dim, in_dim], dtype is uint32. Returns ------- output : tvm.Tensor 2-D with shape [batch, out_dim], dtype is float32. """ assert data.dtype == 'uint32' and weight.dtype == 'uint32', \ "dtype of data and weight should be uint32" assert len(data.shape) == 2 and len(weight.shape) == 2, \ "only support 2-dim binary dense" batch, in_dim = data.shape out_dim, _ = weight.shape k = tvm.reduce_axis((0, in_dim), name='k') matmul = tvm.compute((batch, out_dim), lambda i, j: \ tvm.sum(tvm.popcount(data[i, k] ^ weight[j, k]), axis=k), \ tag='binary_dense') return tvm.compute((batch, out_dim), lambda i, j: \ 32 * in_dim - 2. * matmul(i, j), \ tag=tag.ELEMWISE)
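# A minimal usage sketch for binary_dense above (an assumption: binary_dense, tvm
# and its `topi.tag` dependency are importable in this scope). It builds a
# default llvm schedule and checks against a plain-Python popcount reference.
def _binary_dense_example():
    import numpy as np
    batch, in_dim, out_dim = 4, 8, 3
    A = tvm.placeholder((batch, in_dim), dtype='uint32', name='A')
    W = tvm.placeholder((out_dim, in_dim), dtype='uint32', name='W')
    Y = binary_dense(A, W)
    s = tvm.create_schedule(Y.op)
    f = tvm.build(s, [A, W, Y], 'llvm')
    ctx = tvm.cpu(0)
    a_np = np.random.randint(0, 2**32, size=(batch, in_dim), dtype='uint32')
    w_np = np.random.randint(0, 2**32, size=(out_dim, in_dim), dtype='uint32')
    y = tvm.nd.array(np.zeros((batch, out_dim), dtype=Y.dtype), ctx)
    f(tvm.nd.array(a_np, ctx), tvm.nd.array(w_np, ctx), y)
    # reference: out[i, j] = 32*in_dim - 2*popcount(a[i] ^ w[j])
    ref = np.zeros((batch, out_dim), dtype=Y.dtype)
    for i in range(batch):
        for j in range(out_dim):
            pc = sum(bin(int(a_np[i, k]) ^ int(w_np[j, k])).count('1')
                     for k in range(in_dim))
            ref[i, j] = 32 * in_dim - 2.0 * pc
    tvm.testing.assert_allclose(y.asnumpy(), ref, rtol=1e-5)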
def matmul_v1(N, L, M, dtype): A = tvm.placeholder((N, L), name='A', dtype=dtype) B = tvm.placeholder((L, M), name='B', dtype=dtype) k = tvm.reduce_axis((0, L), name='k') C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') s = tvm.create_schedule(C.op) # schedule y, x = s[C].op.axis k = s[C].op.reduce_axis[0] # 2. get the config object cfg = autotvm.get_config() # 3. define search space cfg.define_knob("tile_y", [1, 2, 4, 8, 16]) cfg.define_knob("tile_x", [1, 2, 4, 8, 16]) # 4. schedule according to config yo, yi = s[C].split(y, cfg['tile_y'].val) xo, xi = s[C].split(x, cfg['tile_x'].val) s[C].reorder(yo, xo, k, yi, xi) return s, [A, B, C]
def matmul(N, L, M, dtype): A = tvm.placeholder((N, L), name='A', dtype=dtype) B = tvm.placeholder((L, M), name='B', dtype=dtype) k = tvm.reduce_axis((0, L), name='k') C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') s = tvm.create_schedule(C.op) # schedule y, x = s[C].op.axis k = s[C].op.reduce_axis[0] ##### define space begin ##### cfg = autotvm.get_config() cfg.define_split("tile_y", y, num_outputs=2) cfg.define_split("tile_x", x, num_outputs=2) ##### define space end ##### # schedule according to config yo, yi = cfg["tile_y"].apply(s, C, y) xo, xi = cfg["tile_x"].apply(s, C, x) s[C].reorder(yo, xo, k, yi, xi) return s, [A, B, C]
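# Hedged sketch of how a template like `matmul` above is typically registered and
# tuned with autotvm (API usage follows the old autotvm tutorial flow; the log
# file name 'matmul.log' and the trial counts are illustrative assumptions).
def _tune_matmul_example():
    import logging
    import sys
    from tvm import autotvm

    # the template must be decorated before task creation, e.g.
    #     @autotvm.template
    #     def matmul(N, L, M, dtype): ...
    task = autotvm.task.create(matmul, args=(512, 512, 512, 'float32'), target='llvm')
    print(task.config_space)

    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    measure_option = autotvm.measure_option(
        builder='local',
        runner=autotvm.LocalRunner(number=5))

    # random search over a handful of configs, logging the measured ones
    tuner = autotvm.tuner.RandomTuner(task)
    tuner.tune(n_trial=10,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file('matmul.log')])

    # apply the best config found so far and compile
    with autotvm.apply_history_best('matmul.log'):
        with tvm.target.create('llvm'):
            s, arg_bufs = matmul(512, 512, 512, 'float32')
            func = tvm.build(s, arg_bufs)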
def global_pool(data, pool_type): """Perform global pooling on the data Parameters ---------- data : tvm.Tensor 4-D with shape [batch, channel, in_height, in_width] pool_type : str Pool type, 'max' or 'avg' Returns ------- output : tvm.Tensor 4-D with shape [batch, channel, 1, 1] """ assert len(data.shape) == 4, "only support 4-dim pooling" batch, channel, height, width = data.shape dheight = tvm.reduce_axis((0, height)) dwidth = tvm.reduce_axis((0, width)) if pool_type == 'max': return tvm.compute((batch, channel, 1, 1), lambda n, c, h, w: \ tvm.max(data[n, c, dheight, dwidth], axis=[dheight, dwidth]), \ tag="global_pool_max") elif pool_type == 'avg': tsum = tvm.compute((batch, channel, 1, 1), lambda n, c, h, w: \ tvm.sum(data[n, c, dheight, dwidth], axis=[dheight, dwidth]), \ tag="global_pool_sum") return tvm.compute((batch, channel, 1, 1), lambda n, c, h, w: \ tsum[n, c, h, w] / (height*width).astype(tsum.dtype), \ tag=tag.ELEMWISE) else: raise ValueError("Pool type should be 'avg' or 'max'.")
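# A small usage sketch for global_pool above (assumes the function and tvm/numpy
# are in scope). A default schedule is enough on llvm; the result is checked
# against numpy's max/mean over the spatial axes.
def _global_pool_example(pool_type='avg'):
    import numpy as np
    A = tvm.placeholder((1, 3, 8, 8), name='A')
    B = global_pool(A, pool_type)
    s = tvm.create_schedule(B.op)
    f = tvm.build(s, [A, B], 'llvm')
    ctx = tvm.cpu(0)
    a_np = np.random.uniform(size=(1, 3, 8, 8)).astype('float32')
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros((1, 3, 1, 1), dtype=B.dtype), ctx)
    f(a, b)
    if pool_type == 'max':
        ref = a_np.max(axis=(2, 3), keepdims=True)
    else:
        ref = a_np.mean(axis=(2, 3), keepdims=True)
    tvm.testing.assert_allclose(b.asnumpy(), ref, rtol=1e-5)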
def test_dot(): nn = 12 n = tvm.convert(nn) A = tvm.placeholder((n,), name='A') B = tvm.placeholder((n,), name='B') k = tvm.reduce_axis((0, n), 'k') C = tvm.compute((1,), lambda _: tvm.sum(A[k] * B[k], axis=k), name='C') s = tvm.create_schedule(C.op) fapi = lower(s, [A, B, C]) def verify(target): if not tvm.module.enabled(target): print("Target %s is not enabled" % target) return f = tvm.codegen.build_module(fapi, target) # verify ctx = tvm.cpu(0) a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), ctx) c = tvm.nd.array(np.zeros((1,), dtype=C.dtype), ctx) f(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-4) verify("llvm")
def get_gemm_feature(target): k = tvm.reduce_axis((0, N), 'k') A = tvm.placeholder((N, N), name='A') B = tvm.placeholder((N, N), name='B') C = tvm.compute(A.shape, lambda y, x: tvm.sum(A[y, k] * B[k, x], axis=k), name='C') s = tvm.create_schedule(C.op) y, x = s[C].op.axis axes = list(s[C].tile(y, x, 8, 8)) + [k] perm = np.random.permutation(5) axes = [axes[x] for x in perm] s[C].reorder(*axes) if "gpu" in target.keys: pick = [] # filter out reduction axis for i in range(len(perm)): if perm[i] != 4: pick.append(axes[i]) s[C].bind(pick[0], tvm.thread_axis("blockIdx.x")) s[C].bind(pick[1], tvm.thread_axis("vthread")) s[C].bind(pick[2], tvm.thread_axis("threadIdx.y")) with target: feas = feature.get_itervar_feature(s, [A, B, C]) feas = feature.flatten_itervar_feature(feas) return feas
def packed_conv2d(data, kernel, padding, strides, out_dtype="int32"): """ Packed conv2d function. """ if padding[0]: pad_data = topi.nn.pad(data, [0, 0, padding[0], padding[1], 0, 0], name="pad_data") else: pad_data = data assert len(data.shape) == 6 assert len(kernel.shape) == 6 oheight = topi.util.simplify((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1) owidth = topi.util.simplify((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1) oshape = (data.shape[0], kernel.shape[0], oheight, owidth, data.shape[4], kernel.shape[4]) ishape = topi.util.get_const_tuple(data.shape) kshape = topi.util.get_const_tuple(kernel.shape) assert data.dtype == "int8", data.dtype assert kernel.dtype == "int8", kernel.dtype d_i = tvm.reduce_axis((0, kshape[2]), name='d_i') d_j = tvm.reduce_axis((0, kshape[3]), name='d_j') k_o = tvm.reduce_axis((0, ishape[1]), name='k_o') k_i = tvm.reduce_axis((0, ishape[-1]), name='k_i') hstride, wstride = strides res = tvm.compute( oshape, lambda b_o, c_o, i, j, b_i, c_i: tvm.sum( pad_data[b_o, k_o, i*hstride+d_i, j*wstride+d_j, b_i, k_i].astype(out_dtype) * kernel[c_o, k_o, d_i, d_j, c_i, k_i].astype(out_dtype), axis=[k_o, d_i, d_j, k_i]), name="res", tag="packed_conv2d") return res
def intrin_gemv(m, n): w = tvm.placeholder((m, n), name='w') x = tvm.placeholder((n,), name='x') k = tvm.reduce_axis((0, n), name='k') z = tvm.compute((m,), lambda i: tvm.sum(w[i, k] * x[k], axis=k), name='z') Wb = tvm.decl_buffer(w.shape, w.dtype, name="W", offset_factor=16, strides=[tvm.var('ldw'), 1]) def intrin_func(ins, outs): ww, xx = ins zz = outs[0] ww_ptr = ww.access_ptr("r") xx_ptr = xx.access_ptr("r") zz_ptr = zz.access_ptr("w") body = tvm.call_packed( "gemm", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0]) reset = tvm.call_packed( "fill_zero", zz_ptr, n) update = tvm.call_packed( "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0]) return body, reset, update with tvm.build_config(data_alignment=16, offset_factor=16): return tvm.decl_tensor_intrin(z.op, intrin_func, binds={w: Wb})
def test_conv_tiling(): HSTR = WSTR = 1 in_channel = 128 kernel_height = kernel_width = 3 out_channel = 64 batch_size = 1 in_height = in_width = 64 out_height = out_width = in_height - kernel_height + 1 data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data') kernel = tvm.placeholder((kernel_height, kernel_width, in_channel, out_channel), name='kernel') ic = tvm.reduce_axis((0, in_channel), name='ic') kh = tvm.reduce_axis((0, kernel_height), name='kh') kw = tvm.reduce_axis((0, kernel_width), name='kw') conv = tvm.compute((batch_size, out_channel, out_height, out_width), lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] * kernel[kh, kw, ic, oc], axis=[ic, kh, kw]), name="conv2d") s = tvm.create_schedule(conv.op) n, oc, oh, ow = conv.op.axis oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16) bounds = tvm.schedule.InferBound(s) stmt = tvm.schedule.ScheduleOps(s, bounds) stmt = tvm.ir_pass.LoopPartition(stmt, True) stmt = tvm.ir_pass.Simplify(stmt) assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
def _declaration_dense_pack(cfg, data, weight, bias=None, out_dtype=None): if out_dtype is None: out_dtype = data.dtype batch, in_dim = get_const_tuple(data.shape) out_dim, _ = get_const_tuple(weight.shape) # create tuning space cfg.define_split("tile_y", batch, num_outputs=3) cfg.define_split("tile_x", out_dim, num_outputs=3) cfg.define_split("tile_k", in_dim, num_outputs=2) if cfg.is_fallback: _default_dense_pack_config(cfg, batch, out_dim, in_dim) packw_bn = cfg["tile_x"].size[-1] packw_shape = (out_dim // packw_bn, in_dim, packw_bn) packw = tvm.compute(packw_shape, lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") k = tvm.reduce_axis((0, in_dim), name="k") C = tvm.compute((batch, out_dim), lambda y, x: tvm.sum( data[y, k].astype(out_dtype) * packw[x // packw_bn, k, x % packw_bn].astype(out_dtype), axis=k), tag="dense_pack") if bias is not None: C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST) return C
def intrin_gemv(m, l): a = tvm.placeholder((l,), name='a') b = tvm.placeholder((m, l), name='b') k = tvm.reduce_axis((0, l), name='k') c = tvm.compute((m,), lambda i: tvm.sum(a[k] * b[i, k], axis=k), name='c') Ab = tvm.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[1]) Bb = tvm.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1, strides=[tvm.var("s1"), 1]) Cb = tvm.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[1]) def intrin_func(ins, outs): ib = tvm.ir_builder.create() aa, bb = ins cc = outs[0] ib.emit(tvm.call_extern("int32", "gemv_update", cc.access_ptr("w"), aa.access_ptr("r"), bb.access_ptr("r"), m, l, bb.strides[0])) return ib.get() with tvm.build_config(offset_factor=1): return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
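# Lower-only sketch of using intrin_gemv above to tensorize a matmul: the inner
# 16-wide slice of the j loop plus the whole reduction is replaced by the
# intrinsic. Only the lowered IR is inspected here; actually running it would
# additionally require an implementation of the extern "gemv_update" to be
# linked in (the sizes 1024/512/64 and factor 16 are illustrative choices).
def _tensorize_gemv_example():
    N, M, L = 1024, 512, 64
    A = tvm.placeholder((N, L), name='A')
    B = tvm.placeholder((M, L), name='B')
    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[j, k], axis=k), name='C')
    s = tvm.create_schedule(C.op)
    x, y = C.op.axis
    yo, yi = s[C].split(y, factor=16)
    # replace the (yi, k) sub-computation with the gemv intrinsic
    s[C].tensorize(yi, intrin_gemv(16, L))
    print(tvm.lower(s, [A, B, C], simple_mode=True))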
def dense_default(data, weight, bias=None): """The default implementation of dense in topi. Parameters ---------- data : tvm.Tensor 2-D with shape [batch, in_dim] weight : tvm.Tensor 2-D with shape [out_dim, in_dim] bias : tvm.Tensor, optional 1-D with shape [out_dim] Returns ------- output : tvm.Tensor 2-D with shape [batch, out_dim] """ assert len(data.shape) == 2 and len(weight.shape) == 2, \ "only support 2-dim dense" if bias is not None: assert len(bias.shape) == 1 batch, in_dim = data.shape out_dim, _ = weight.shape k = tvm.reduce_axis((0, in_dim), name='k') matmul = tvm.compute((batch, out_dim), \ lambda i, j: tvm.sum(data[i, k] * weight[j, k], axis=k), \ tag='dense') if bias is not None: matmul = tvm.compute((batch, out_dim), \ lambda i, j: matmul[i, j] + bias[j], \ tag=tag.BROADCAST) return matmul
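# Usage sketch for dense_default above (assumes the function and tvm/numpy are in
# scope); a default schedule is built for llvm and checked against numpy.dot.
def _dense_default_example():
    import numpy as np
    batch, in_dim, out_dim = 4, 16, 8
    data = tvm.placeholder((batch, in_dim), name='data')
    weight = tvm.placeholder((out_dim, in_dim), name='weight')
    bias = tvm.placeholder((out_dim,), name='bias')
    out = dense_default(data, weight, bias)
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, weight, bias, out], 'llvm')
    ctx = tvm.cpu(0)
    d = np.random.uniform(size=(batch, in_dim)).astype('float32')
    w = np.random.uniform(size=(out_dim, in_dim)).astype('float32')
    b = np.random.uniform(size=(out_dim,)).astype('float32')
    y = tvm.nd.array(np.zeros((batch, out_dim), dtype=out.dtype), ctx)
    f(tvm.nd.array(d, ctx), tvm.nd.array(w, ctx), tvm.nd.array(b, ctx), y)
    tvm.testing.assert_allclose(y.asnumpy(), d.dot(w.T) + b, rtol=1e-5)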
def test_local_gemm(): if not tvm.module.enabled("opengl"): return if not tvm.module.enabled("llvm"): return nn = 1024 n = tvm.var('n') n = tvm.convert(nn) m = n l = n A = tvm.placeholder((n, l), name='A', dtype='int32') B = tvm.placeholder((m, l), name='B', dtype='int32') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute((n, m), lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k), name='CC') s = tvm.create_schedule(C.op) s[C].opengl() print(tvm.lower(s, [A, B, C], simple_mode=True)) f = tvm.build(s, [A, B, C], "opengl", name="gemm") print("------opengl code------") print(f.imported_modules[0].get_source(fmt="gl")) ctx = tvm.opengl() n, m, l = nn, nn, nn a_np = np.random.uniform(low=0, high=10, size=(n, l)).astype(A.dtype) b_np = np.random.uniform(low=0, high=10, size=(m, l)).astype(B.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T))
def test_rfactor(): n = tvm.var('n') k1 = tvm.reduce_axis((0, n), name="k1") k2 = tvm.reduce_axis((0, n), name="k2") A = tvm.placeholder((n, n, n), name='A') B = tvm.compute((n, ), lambda i: tvm.sum(A[i, k1, k2], axis=[k1, k2])) # normal schedule s = tvm.create_schedule(B.op) BF = s.rfactor(B, k1) assert(tuple(BF.shape) == (n, n)) assert(set(BF.op.body[0].axis) == set([k2])) assert(s[B].op.body[0].axis[0].dom.extent == n) assert(len(s[B].all_iter_vars) == 2) # schedule with split s = tvm.create_schedule(B.op) ko, ki = s[B].split(k1, factor=4) xo, xi = s[B].split(B.op.axis[0], factor=8) BF = s.rfactor(B, ki) assert(BF.shape[0].value == 4) assert(BF.shape[1] == n) assert(BF.op.body[0].axis[0] == k2) assert(BF.op.body[0].axis[1].var == ko.var) assert(s[B].op.body[0].axis[0].dom.extent.value == 4) # schedule with factor_axis s = tvm.create_schedule(B.op) ko, ki = s[B].split(k1, factor=4) xo, xi = s[B].split(B.op.axis[0], factor=8) BF = s.rfactor(B, ki, 1) assert(n == BF.shape[0]) assert(BF.shape[1].value == 4) assert(BF.op.body[0].axis[0] == k2) assert(BF.op.body[0].axis[1].var == ko.var) assert(s[B].op.body[0].axis[0].dom.extent.value == 4)
def test_in_bounds_conv_llvm(loop_tiling=False): HSTR = WSTR = 1 in_channel = 128 kernel_height = kernel_width = 3 out_channel = 64 batch_size = 1 in_height = in_width = 64 out_height = out_width = in_height - kernel_height + 1 data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data') kernel = tvm.placeholder((kernel_height, kernel_width, in_channel, out_channel), name='kernel') ic = tvm.reduce_axis((0, in_channel), name='ic') kh = tvm.reduce_axis((0, kernel_height), name='kh') kw = tvm.reduce_axis((0, kernel_width), name='kw') conv = tvm.compute((batch_size, out_channel, out_height, out_width), lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] * kernel[kh, kw, ic, oc], axis=[ic, kh, kw]), name="conv2d") s = tvm.create_schedule(conv.op) n, oc, oh, ow = conv.op.axis if loop_tiling: oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16) lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True) print (lowered_func.body) ctx = tvm.cpu (0) f = tvm.build(s, [data, kernel, conv], "llvm") data_input = tvm.nd.array(np.random.uniform( size=(batch_size, in_channel, in_height, in_width)).astype(tvm.float32), ctx) kernel_input = tvm.nd.array(np.random.uniform( size=(kernel_height, kernel_width, in_channel, out_channel)).astype(tvm.float32), ctx) conv_out = tvm.nd.empty ((batch_size, out_channel, out_height, out_width), tvm.float32, ctx) f(data_input, kernel_input, conv_out)
def test_rfactor(): n = tvm.convert(1027) A = tvm.placeholder((n,), name='A') k = tvm.reduce_axis((0, n)) B = tvm.compute((1,), lambda i: tvm.sum(A[k], axis=k), name='B') # schedule s = tvm.create_schedule(B.op) kf, ki = s[B].split(k, nparts=4) BF = s.rfactor(B, kf) s[BF].parallel(BF.op.axis[0]) # one line to build the function. def check_target(target="llvm"): if not tvm.module.enabled(target): return ctx = tvm.cpu(0) fapi = tvm.lower(s, args=[A, B]) fsum = tvm.build(fapi, target=target, name="mysum") # launch the kernel. n = 1027 a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx) fsum(a, b) res = np.sum(a.asnumpy(), axis=0) tvm.testing.assert_allclose( b.asnumpy(), res, rtol=1e-4) check_target()
def _spatial_pack(data, kernel, stride, padding, out_dtype): """ Compute convolution with pack on spatial axes. """ assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" wkl = _get_workload(data, kernel, stride, padding, out_dtype) sch = _get_schedule(wkl) H, W = wkl.height, wkl.width CI, CO = wkl.in_filter, wkl.out_filter KH, KW = wkl.hkernel, wkl.wkernel HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride HCAT, WCAT = KH-1, KW-1 VH = sch.vh VW = sch.vw VC = sch.vc UNROLL = sch.unroll TH = H + 2*HPAD TW = W + 2*WPAD OH = (H + 2*HPAD - KH) // HSTR + 1 OW = (W + 2*WPAD - KW) // WSTR + 1 dshape = (1, CI, H, W) dpshape = (1, CI, TH, TW) dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT) kshape = (CO, CI, KH, KW) kvshape = (CO//VC, CI, KH, KW, VC) ovshape = (1, CO // VC, OH // VH, OW // VW, VH, VW, VC) oshape = (1, CO, OH, OW) DOPAD = (HPAD != 0 or WPAD != 0) if DOPAD: data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") else: data_pad = data data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: \ data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec') kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc: \ kernel[co*VC+vc][ci][dh][dw], name='kernel_vec') ci = tvm.reduce_axis((0, CI), name='ci') dh = tvm.reduce_axis((0, KH), name='dh') dw = tvm.reduce_axis((0, KW), name='dw') conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ tvm.sum(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw].astype(out_dtype) * kernel_vec[co, ci, dh, dw, vc].astype(out_dtype), axis=[ci, dh, dw]), name='conv') output = tvm.compute(oshape, lambda n, co, h, w: conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC], name='output_unpack', tag='spatial_conv_output') return output
def test_make_sum(): A = tvm.placeholder((2, 10), name='A') k = tvm.reduce_axis((0,10), "k") B = tvm.compute((2,), lambda i: tvm.sum(A[i, k], axis=k), name="B") json_str = tvm.save_json(B) BB = tvm.load_json(json_str) assert B.op.body[0].combiner is not None assert BB.op.body[0].combiner is not None
def _conv(nn, ff, yy, xx): b1b2 = (b1+b2).astype(out_dtype) return tvm.sum( ((tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & Filter_q[ff, rc, ry, rx, b2]) - tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & ~Filter_q[ff, rc, ry, rx, b2])) << (b1b2)).astype(out_dtype), axis=[rc, ry, rx, b2, b1]).astype(out_dtype)
def dp4a(x_scope='local', y_scope='local', z_scope='local'): """ Int8 dot product reduced by every 4 elements using __dp4a Parameters ---------- x_scope : str, optional The storage scope of buffer for lhs y_scope : str, optional The storage scope of buffer for rhs z_scope : str, optional The storage scope of buffer for result Returns ------- intrin : TensorIntrin The dp4a TensorIntrin that can be used in tensorizing schedule. """ n = 4 # dp4a requires operands packed by 4 x = tvm.placeholder((n,), name='x', dtype='int8') y = tvm.placeholder((n,), name='y', dtype='int8') k = tvm.reduce_axis((0, n), name='rc') z = tvm.compute((1,), lambda i: tvm.sum( x[k].astype('int32') * y[k].astype('int32'), axis=[k])) def _intrin_func(ins, outs): def _instr(index): xx, yy = ins zz = outs[0] if index == 1: return zz.vstore(0, 0) ib = tvm.ir_builder.create() vec_x = xx.vload(0, dtype='int8x4') vec_y = yy.vload(0, dtype='int8x4') prev_z = 0 if index == 0 else zz.vload(0) new_z = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z) ib.emit(zz.vstore(0, new_z)) return ib.get() return _instr(0), _instr(1), _instr(2) # body, reset, update with tvm.build_config(data_alignment=4, offset_factor=1) as cfg: scopes = {x: x_scope, y: y_scope, z: z_scope} binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name, data_alignment=cfg.data_alignment, offset_factor=cfg.offset_factor, scope=scopes[t]) for t in [x, y, z]} return tvm.decl_tensor_intrin(z.op, _intrin_func, binds=binds)
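# Lower-only sketch of tensorizing a 4-wide int8 reduction with the dp4a
# intrinsic above. Scopes are set to 'global' here only so the sketch can bind
# plain placeholders; real schedules (e.g. the CUDA int8 conv2d ones) tensorize
# cached local/shared buffers instead. Running the result would need a CUDA
# target with native __dp4a support, so the sketch stops at tvm.lower.
def _dp4a_example():
    x = tvm.placeholder((4,), name='xx', dtype='int8')
    y = tvm.placeholder((4,), name='yy', dtype='int8')
    k = tvm.reduce_axis((0, 4), name='k')
    z = tvm.compute((1,), lambda i: tvm.sum(
        x[k].astype('int32') * y[k].astype('int32'), axis=[k]), name='zz')
    s = tvm.create_schedule(z.op)
    s[z].tensorize(z.op.axis[0], dp4a('global', 'global', 'global'))
    print(tvm.lower(s, [x, y, z], simple_mode=True))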
def test_rank_zero(): m = tvm.var('m') A = tvm.placeholder((m,), name='A') scale = tvm.placeholder((), name='s') k = tvm.reduce_axis((0, m), name="k") T = tvm.compute((), lambda : tvm.sum(A[k] * scale(), axis=k)) print(T) print(T.op.body) assert(tuple(T.shape) == ())
def test_gemm_bound(): nn = 1024 n = tvm.convert(nn) A = tvm.placeholder((n, n), name='A') B = tvm.placeholder((n, n), name='B') k = tvm.reduce_axis((0, n), name='k') C = tvm.compute( (n, n), lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k), name='CC') # schedule s = tvm.create_schedule(C.op) xtile, ytile = 32, 32 scale = 8 num_thread = 8 block_factor = scale * num_thread block_x = tvm.thread_axis("blockIdx.x") thread_x = tvm.thread_axis("threadIdx.x") block_y = tvm.thread_axis("blockIdx.y") thread_y = tvm.thread_axis("threadIdx.y") CC = s.cache_write(C, "local") AA = s.cache_read(A, "shared", [CC]) BB = s.cache_read(B, "shared", [CC]) by, yi = s[C].split(C.op.axis[0], factor=block_factor) bx, xi = s[C].split(C.op.axis[1], factor=block_factor) s[C].reorder(by, bx, yi, xi) s[C].bind(by, block_y) s[C].bind(bx, block_x) ty, yi = s[C].split(yi, nparts=num_thread) tx, xi = s[C].split(xi, nparts=num_thread) s[C].reorder(ty, tx, yi, xi) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) yo, xo = CC.op.axis s[CC].reorder(k, yo, xo) s[CC].compute_at(s[C], tx) s[AA].compute_at(s[CC], k) s[BB].compute_at(s[CC], k) ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread) tx, xi = s[AA].split(xi, nparts=num_thread) s[AA].bind(ty, thread_y) s[AA].bind(tx, thread_x) ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread) tx, xi = s[BB].split(xi, nparts=num_thread) s[BB].bind(ty, thread_y) s[BB].bind(tx, thread_x) s = s.normalize() bounds = tvm.schedule.InferBound(s) assert(bounds[BB.op.axis[0]].extent.value==64) assert(bounds[AA.op.axis[0]].extent.value==64) assert(bounds[CC.op.axis[0]].extent.value == 8) assert(bounds[CC.op.axis[1]].extent.value == 8)
def test_tensorize_matmul(): n = 1024 m = n l = n A = tvm.placeholder((n, l), name='A') B = tvm.placeholder((m, l), name='B') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute((n, m), lambda i, j: tvm.sum(B[j, k] * A[i, k], axis=k), name='C') def check(factor): s = tvm.create_schedule(C.op) x, y = C.op.axis yo, yi = s[C].split(y, factor=factor) gemv = intrin_gemv(factor, l) s[C].tensorize(yi, gemv) s = s.normalize() dom_map = tvm.schedule.InferBound(s) finfer = tvm.get_global_func("test.op.InferTensorizeRegion") out_dom, in_dom = finfer(s[C], dom_map) assert tvm.ir_pass.Equal(out_dom[x].extent, 1) assert tvm.ir_pass.Equal(out_dom[y].extent, factor) assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") body = fmatch(s[C], out_dom, in_dom, gemv) assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]), tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) stmt = tvm.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [A, B, C]) def check_rfactor(factor, rfactor): s = tvm.create_schedule(C.op) x, y = C.op.axis rk = C.op.reduce_axis[0] yo, yi = s[C].split(y, factor=factor) ro, ri = s[C].split(rk, factor=rfactor) s[C].reorder(yo, ro, yi, ri) gemv = intrin_gemv(factor, rfactor) s[C].tensorize(yi, gemv) s = s.normalize() dom_map = tvm.schedule.InferBound(s) finfer = tvm.get_global_func("test.op.InferTensorizeRegion") out_dom, in_dom = finfer(s[C], dom_map) assert tvm.ir_pass.Equal(out_dom[x].extent, 1) assert tvm.ir_pass.Equal(out_dom[y].extent, factor) assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") body = fmatch(s[C], out_dom, in_dom, gemv) assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]), tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) stmt = tvm.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [A, B, C]) def check_rfactor_no_reset(factor, rfactor): s = tvm.create_schedule(C.op) x, y = C.op.axis rk = C.op.reduce_axis[0] yo, yi = s[C].split(y, factor=factor) ro, ri = s[C].split(rk, factor=rfactor) s[C].reorder(yo, ro, yi, ri) gemv = intrin_gemv_no_reset(factor, rfactor) s[C].tensorize(yi, gemv) s = s.normalize() dom_map = tvm.schedule.InferBound(s) finfer = tvm.get_global_func("test.op.InferTensorizeRegion") out_dom, in_dom = finfer(s[C], dom_map) assert tvm.ir_pass.Equal(out_dom[x].extent, 1) assert tvm.ir_pass.Equal(out_dom[y].extent, factor) assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") body = fmatch(s[C], out_dom, in_dom, gemv) assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]), tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) stmt = tvm.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [A, B, C]) def check_rfactor_no_reset_multi_reduction(factor, rfactor): s = tvm.create_schedule(C.op) x, y = C.op.axis rk = C.op.reduce_axis[0] yo, yi = s[C].split(y, factor=factor) ro, ri = s[C].split(rk, factor=rfactor) roo, roi = s[C].split(ro, factor=2) s[C].reorder(yo, roo, roi, yi, ri) gemv = intrin_gemv_no_reset(factor, rfactor) s[C].tensorize(yi, gemv) s = s.normalize() dom_map = tvm.schedule.InferBound(s) finfer = tvm.get_global_func("test.op.InferTensorizeRegion") out_dom, in_dom = finfer(s[C], dom_map) assert tvm.ir_pass.Equal(out_dom[x].extent, 1) assert tvm.ir_pass.Equal(out_dom[y].extent, factor) assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") body = fmatch(s[C], out_dom, in_dom, gemv) assert 
tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]), tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) stmt = tvm.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [A, B, C]) check(16) check_rfactor(16, 16) check_rfactor_no_reset(16, 16) check_rfactor_no_reset_multi_reduction(16, 16)
def _run(env, remote): # declare o = 4 n = 1 m = 4 x = tvm.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="x", dtype=env.inp_dtype) w = tvm.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="w", dtype=env.wgt_dtype) x_buf = tvm.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: x(*i), "x_buf") w_buf = tvm.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN), lambda *i: w(*i), "w_buf") ko = tvm.reduce_axis((0, n), name="ko") ki = tvm.reduce_axis((0, env.BLOCK_IN), name="ki") y_gem = tvm.compute( (o, m, env.BATCH, env.BLOCK_OUT), lambda bo, co, bi, ci: tvm.sum(x_buf[bo, ko, bi, ki].astype( env.acc_dtype) * w_buf[co, ko, ci, ki].astype(env.acc_dtype), axis=[ko, ki]), name="y_gem") y_shf = tvm.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: y_gem(*i) >> 8, name="y_shf") y_max = tvm.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.max(y_shf(*i), 0), "y_max") #relu y_min = tvm.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.min(y_max(*i), (1 << (env.INP_WIDTH - 1)) - 1), "y_min") #relu y = tvm.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: y_min(*i).astype(env.inp_dtype), name="y") if not remote: return def verify(s): mod = vta.build(s, [x, w, y], "ext_dev", env.target_host) temp = util.tempdir() mod.save(temp.relpath("gemm.o")) remote.upload(temp.relpath("gemm.o")) f = remote.load_module("gemm.o") # verify ctx = remote.ext_dev(0) x_np = np.random.randint(-128, 128, size=(o, n, env.BATCH, env.BLOCK_IN)).astype(x.dtype) w_np = np.random.randint(-128, 128, size=(m, n, env.BLOCK_OUT, env.BLOCK_IN)).astype(w.dtype) y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype) x_nd = tvm.nd.array(x_np, ctx) w_nd = tvm.nd.array(w_np, ctx) y_nd = tvm.nd.array(y_np, ctx) y_np = y_np.astype(env.acc_dtype) for b in range(o): for i in range(m): for j in range(n): y_np[b, i, :] += np.dot( x_np[b, j, :].astype(env.acc_dtype), w_np[i, j].T.astype(env.acc_dtype)) y_np = np.right_shift(y_np, 8) y_np = np.clip(y_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(y.dtype) if env.TARGET == "sim": simulator.clear_stats() f(x_nd, w_nd, y_nd) print(simulator.stats()) else: f(x_nd, w_nd, y_nd) np.testing.assert_equal(y_np, y_nd.asnumpy()) def test_schedule1(): # default schedule with no smt s = tvm.create_schedule(y.op) # set the scope of the SRAM buffers s[x_buf].set_scope(env.inp_scope) s[w_buf].set_scope(env.wgt_scope) s[y_gem].set_scope(env.acc_scope) s[y_shf].set_scope(env.acc_scope) s[y_max].set_scope(env.acc_scope) s[y_min].set_scope(env.acc_scope) # set pragmas for DMA transfer and ALU ops s[x_buf].compute_at(s[y_gem], ko) s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy) s[w_buf].compute_at(s[y_gem], ko) s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy) s[y_shf].pragma(s[y_shf].op.axis[0], env.alu) s[y_max].pragma(s[y_max].op.axis[0], env.alu) s[y_min].pragma(s[y_min].op.axis[0], env.alu) s[y].pragma(s[y].op.axis[0], env.dma_copy) # tensorization s[y_gem].reorder(ko, s[y_gem].op.axis[0], s[y_gem].op.axis[1], s[y_gem].op.axis[2], s[y_gem].op.axis[3], ki) s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm) verify(s) def test_smt(): # test smt schedule s = tvm.create_schedule(y.op) s[x_buf].set_scope(env.inp_scope) s[w_buf].set_scope(env.wgt_scope) s[y_gem].set_scope(env.acc_scope) s[y_shf].set_scope(env.acc_scope) s[y_max].set_scope(env.acc_scope) s[y_min].set_scope(env.acc_scope) abo, aco, abi, aci = s[y].op.axis abo1, abo2 = s[y].split(abo, nparts=2) s[y].bind(abo1, tvm.thread_axis("cthread")) s[y_gem].compute_at(s[y], abo1) s[y_shf].compute_at(s[y], abo1) s[y_max].compute_at(s[y], 
abo1) s[y_min].compute_at(s[y], abo1) s[y_gem].reorder(ko, s[y_gem].op.axis[0], s[y_gem].op.axis[1], s[y_gem].op.axis[2], s[y_gem].op.axis[3], ki) s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm) s[y_shf].pragma(s[y_shf].op.axis[0], env.alu) s[y_max].pragma(s[y_max].op.axis[0], env.alu) s[y_min].pragma(s[y_min].op.axis[0], env.alu) s[x_buf].compute_at(s[y_gem], ko) s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy) s[w_buf].compute_at(s[y_gem], ko) s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy) s[y].pragma(abo2, env.dma_copy) verify(s) test_schedule1() test_smt()
def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float16'): batch, in_channel, in_height, in_width = [ util.get_const_int(x) for x in data.shape ] num_filter, channel, kernel_h, kernel_w = [ util.get_const_int(x) for x in kernel.shape ] pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) if isinstance(stride, (tuple, list)): stride_h, stride_w = stride else: stride_h, stride_w = stride, stride out_channel = num_filter out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) oshape = (batch, out_channel, out_height, out_width) pad_before = [0, 0, pad_top, pad_left] pad_after = [0, 0, pad_down, pad_right] temp = pad(data, pad_before, pad_after, name="pad_temp") rc = tvm.reduce_axis((0, in_channel), name='rc') ry = tvm.reduce_axis((0, kernel_h), name='ry') rx = tvm.reduce_axis((0, kernel_w), name='rx') block_w = 0 block_h = 0 if stride_h == 2: if num_filter + kernel_h == 515: conv_tag = "4_4" block_h = 4 block_w = 4 else: conv_tag = "4_5" block_h = 4 block_w = 5 elif kernel_h == 3: if num_filter == 512: conv_tag = "2_7" block_h = 2 block_w = 7 else: conv_tag = "2_14" block_h = 2 block_w = 14 else: conv_tag = "1_16" block_h = 1 block_w = 16 c_h = out_height c_w = out_width if not out_height % block_h == 0: c_h = (out_height // block_h + 1) * block_h if not out_width % block_w == 0: c_w = (out_width // block_w + 1) * block_w nv = 16 cshape = (batch, out_channel // nv, c_h, c_w, nv) kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv) kernel_vec = tvm.compute( kvshape, lambda co, ci, kh, kw, vc: kernel[co * nv + vc][ci][kh][kw], name='kernel_vec') conv = tvm.compute( cshape, lambda nn, ff, yy, xx, vc:\ tvm.sum( temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype), axis=[rc, ry, rx]), tag=conv_tag, name='conv') output = tvm.compute( oshape, lambda nn, ff, yy, xx: conv[nn][ff // nv][yy][xx][ff % nv], name='output_unpack', tag=conv_tag) return output
def dot_16x1x16_int8_int8_int32(): """ Int8 dot product by every 4 elements using AVX512 Skylake instructions. This function takes two arrays of int8 datatype -- data[4] and kernel[16][4] -- and computes a dot product of data[4] with every 4 elements of kernels, resulting in output[16] of int32 datatype. The pseudo code is as follows. .. code-block:: c void dot_16x1x16_int8_int8_int32(int8 data[4], int8 kernel[16][4], int32 output[16]){ for (int i = 0; i < 16; i++){ out[i] = 0; for (int k = 0; k < 4; k++){ out[i] += data[k] * kernel[i][k] } } } Physically, the kernel array sits in an AVX512 vector register and the data[4] is broadcasted to another AVX512 vector register. This function returns a TensorIntrin that can be used to tensorize a schedule. Returns ------- intrin : TensorIntrin The Skylake int8 TensorIntrin that can be used in tensorizing schedule """ int32_lanes = 16 # 16 int32 lanes in AVX512 num_int8_elements = 4 # 4 int8 elements in int32 data = tvm.placeholder((num_int8_elements, ), dtype='uint8', name='data') kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='int8', name='kernel') k = tvm.reduce_axis((0, num_int8_elements), name='k') C = tvm.compute( (int32_lanes, ), lambda i: tvm.sum( data[k].astype('int32') * kernel[i, k].astype('int32'), axis=k), name="C") a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer", offset_factor=1, strides=[1]) b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer", offset_factor=1, strides=[tvm.var('ldw'), 1]) def _intrin_func(ins, outs): def _instr(index): ib = tvm.ir_builder.create() if index == 1: ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) return ib.get() a_int8 = ins[0].vload([0], "uint8x4") re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8) vec_ai32 = re_int32.astype('int32x16') vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32) vec_b = ins[1].vload([0, 0], "int8x64") vec_one = tvm.const(1, "int16x32") pair_reduction = tvm.call_llvm_intrin( 'int16x32', 'llvm.x86.avx512.pmaddubs.w.512', tvm.const(0, 'uint32'), vec_a, vec_b) quad_reduction = tvm.call_llvm_intrin( 'int32x16', 'llvm.x86.avx512.pmaddw.d.512', tvm.const(0, 'uint32'), pair_reduction, vec_one) if index == 0: ib.emit(outs[0].vstore(0, quad_reduction)) else: ib.emit(outs[0].vstore( 0, quad_reduction + outs[0].vload([0], 'int32x16'))) return ib.get() # body, reset, update return _instr(0), _instr(1), _instr(2) with tvm.build_config(offset_factor=1, partition_const_loop=True): return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={ data: a_buffer, kernel: b_buffer })
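# Lower-only sketch for the Skylake int8 intrinsic above: a 16x4 by 4 dot-product
# compute with the same pattern is tensorized along its output axis and the
# lowered IR is printed. Generating and running machine code from this would
# additionally need an llvm target with skylake-avx512 enabled, so the sketch
# stops at tvm.lower.
def _dot_16x1x16_example():
    data = tvm.placeholder((4,), dtype='uint8', name='data')
    kernel = tvm.placeholder((16, 4), dtype='int8', name='kernel')
    k = tvm.reduce_axis((0, 4), name='k')
    out = tvm.compute((16,), lambda i: tvm.sum(
        data[k].astype('int32') * kernel[i, k].astype('int32'), axis=k), name='out')
    s = tvm.create_schedule(out.op)
    s[out].tensorize(out.op.axis[0], dot_16x1x16_int8_int8_int32())
    print(tvm.lower(s, [data, kernel, out], simple_mode=True))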
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation if len(kernel.shape) == 4: if dilation_h != 1 or dilation_w != 1: kernel = dilate(kernel, (1, 1, dilation_h, dilation_w)) pre_computed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation" pre_computed = True H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape) CO *= VC KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1 HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) assert layout == 'NCHW' assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") idxd = tvm.indexdiv idxm = tvm.indexmod r = KW m = tile_size alpha = m + r - 1 A, B, G = winograd_transform_matrices(m, r, out_dtype) K = CO C = CI H = (IH + 2 * HPAD - 3) // HSTR + 1 W = (IW + 2 * WPAD - 3) // WSTR + 1 nH, nW = (H + m-1) // m, (W + m-1) // m P = N * nH * nW cfg.define_split('tile_p', cfg.axis(P), num_outputs=2, filter=lambda x: x.size[-1] <= 16) cfg.define_split('tile_k', cfg.axis(K), num_outputs=2, filter=lambda x: x.size[-1] <= 16) VP = cfg['tile_p'].size[-1] VK = cfg['tile_k'].size[-1] # pack input tile input_tile = tvm.compute((C, idxd(P, VP), alpha, alpha, VP), lambda c, b, eps, nu, bb: data_pad[idxd(b*VP + bb, nH*nW), c, idxm(idxd(b*VP + bb, nW), nH) * m + eps, idxm(b*VP + bb, nW) * m + nu], name='d') # transform kernel if pre_computed: U = kernel else: r_kh = tvm.reduce_axis((0, KH), 'r_kh') r_kw = tvm.reduce_axis((0, KW), 'r_kw') U = tvm.compute((alpha, alpha, idxd(K, VK), C, VK), lambda eps, nu, k, c, kk: tvm.sum(kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype) * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U') # transform image r_eps = tvm.reduce_axis((0, alpha), 'r_eps') r_nu = tvm.reduce_axis((0, alpha), 'r_nu') V = tvm.compute((alpha, alpha, idxd(P, VP), C, VP), lambda eps, nu, b, c, bb: tvm.sum(input_tile[c][b][r_eps][r_nu][bb].astype(out_dtype) * B[r_eps][eps] * B[r_nu][nu], axis=[r_eps, r_nu]), name='V') # batch gemm c = tvm.reduce_axis((0, C), name='c') M = tvm.compute((alpha, alpha, K, P), lambda eps, nu, k, b: tvm.sum(U[eps][nu][idxd(k, VK)][c][idxm(k, VK)] * V[eps][nu][idxd(b, VP)][c][idxm(b, VP)], axis=c), name='M') # inverse transform r_eps = tvm.reduce_axis((0, alpha), 'r_eps') r_nu = tvm.reduce_axis((0, alpha), 'r_nu') Y = tvm.compute((K, P, m, m), lambda k, b, vh, vw: tvm.sum(M[r_eps][r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw], axis=[r_eps, r_nu]), name='Y') # unpack output output = tvm.compute((N, K, H, W), lambda n, k, h, w: Y[k][n * nH * nW + idxd(h, m) * nW + idxd(w, m), idxm(h, m), idxm(w, m)], name='output', tag='winograd_conv2d_output') # we have to manually assign effective GFLOP for winograd cfg.add_flop(2 * N * K * H * W * KH * KW * C) return output
A0 = tvm.placeholder((n, ), name='A0', dtype='float32') A1 = tvm.placeholder((n, ), name='A1', dtype='float32') A2 = tvm.placeholder((n, ), name='A2', dtype='float32') B0 = tvm.placeholder((n, ), name='B0', dtype='float32') B1 = tvm.placeholder((n, ), name='B1', dtype='float32') B2 = tvm.placeholder((n, ), name='B2', dtype='float32') D = tvm.placeholder((n, ), name='D', dtype='float32') D_ij = lambda i : (A0[i] - B0[j]) * (B0[j] - A0[i]) \ + (A1[i] - B1[j]) * (B1[j] - A1[i]) \ + (A2[i] - B2[j]) * (B2[j] - A2[i]) K_ij = lambda i: tvm.call_pure_extern("float32", "__expf", D_ij(i)) C0 = tvm.compute((n, ), lambda i: tvm.sum(K_ij(i) * D[j], axis=j), name="C0") # Scheduled the computation s0 = tvm.create_schedule(C0.op) bx, tx = s0[C0].split(C0.op.axis[0], factor=192) s0[C0].bind(bx, tvm.thread_axis("blockIdx.x")) s0[C0].bind(tx, tvm.thread_axis("threadIdx.x")) # Actually build the binary fconv0 = tvm.build(s0, [A0, A1, A2, B0, B1, B2, D, C0], tgt, target_host=tgt_host, name="myconv0") # Benchmark nits = 10
a_buf = tvm.compute( shape1_tiled, lambda ico, no, ni, ici: a[no * gemm_shape[0] + ni, ico * factor + ici], 'a_buf') b_buf = tvm.compute( shape2_tiled, lambda ico, oco, oci, ici: b[oco * gemm_shape[ 2] + oci, ico * factor + ici], 'b_buf') out_shape_tiled = (shape1_tiled[1], shape2_tiled[1], shape1_tiled[2], shape2_tiled[2]) ko = tvm.reduce_axis((0, shape1[1] // factor), 'ko') ki = tvm.reduce_axis((0, factor), 'ki') out_buf = tvm.compute( out_shape_tiled, lambda xo, yo, xi, yi: tvm.sum(a_buf[ko, xo, xi, ki].astype(dtype_w) * b_buf[ko, yo, yi, ki].astype(dtype_w), axis=[ko, ki]), 'out_buf') out_acc = out_buf # nnpu.utils.MarkScope(out_acc, 'acc') # out_buf = tvm.compute(out_shape_tiled, lambda *i: out_acc(*i), 'out_host') # nnpu.utils.MarkScope(out_buf) out_host = tvm.compute(out_shape_tiled, lambda *i: out_buf(*i), 'out_host') # schedule s = nnpu.create_schedule(out_host.op) # al = s.cache_read(a_buf, env.get_scope('buffer1'), out_acc) # bl = s.cache_read(b_buf, env.get_scope('buffer2'), out_acc) al = a_buf bl = b_buf a_buffer_scope = 'buffer1'
import os input(os.getpid()) tgt_host = "c" device = "dpu" M = tvm.var("M") K = tvm.var("K") N = tvm.var("N") A = tvm.placeholder((M, K), name='A', dtype='float32') B = tvm.placeholder((K, N), name='B', dtype='float32') k = tvm.reduce_axis((0, K), 'k') C = tvm.compute((M, N), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') s = tvm.create_schedule(C.op) func = tvm.build(s, [A, B, C], target_host=tgt_host, name='DPUGemm') #print("------------------DPU_LOWER code---------------------") #print(tvm.lower(s, [A, B, C], simple_mode=True)) """ scale = 4 num_thread = 8 block_factor = scale * num_thread block_x = tvm.thread_axis("blockIdx.x") thread_x = tvm.thread_axis( "threadIdx.x") block_y = tvm.thread_axis("blockIdx.y")
def lrn(data, size, axis=1, alpha=0.0001, beta=0.75, bias=2): """Perform the across channels local response normalisation on the input data. sum_sqr_up^i{x, y} = (bias+((alpha/size)* \ {sum_{j=max(0, i-size/2)}^{min(N-1,i+size/2)} \ (data^j{x,y})^2}))^beta output^i{x, y} = data^i{x, y}/sum_sqr_up^i{x, y} N is the number for input channels Parameters ---------- data : tvm.Tensor 4-D with shape [batch, channel, height, width] size : int normalisation window size axis : int input data layout channel axis default value is 1 for NCHW format bias : float offset to avoid dividing by 0 alpha : float to be divided beta : float exponent Returns ------- output : tvm.Tensor 4-D output with same shape """ assert len(data.shape) == 4, "only support 4-dim lrn" assert (size % 2) == 1, "size should be odd number" assert (axis == 1) or (axis == 3), "axis should 1 or 3 for NCHW and NHWC" ##Add padding on left & right of size radius first pad_after = pad_before = [0, 0, 0, 0] pad_after[axis] = pad_before[axis] = (size // 2) pad_data = pad(data, pad_before, pad_after, name="pad_data") rxs = tvm.reduce_axis((0, size), name='rxs') if axis == 1: #NCHW layout sqr_sum = tvm.compute( data.shape, lambda i, j, k, l: tvm.sum(pad_data[i, j + rxs, k, l] * pad_data[i, j + rxs, k, l], axis=rxs)) elif axis == 3: #NHWC layout sqr_sum = tvm.compute( data.shape, lambda i, j, k, l: tvm.sum(pad_data[i, j, k, l + rxs] * pad_data[i, j, k, l + rxs], axis=rxs)) sqr_sum_up = tvm.compute( data.shape, lambda i, j, k, l: tvm.power( (bias + (alpha * sqr_sum[i, j, k, l] / size)), beta)) return topi.broadcast_div(data, sqr_sum_up)
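# Usage sketch for the lrn computation above (assumes the function plus tvm, topi
# and numpy are in scope). It builds a default llvm schedule and compares against
# a straightforward numpy reference of the same formula.
def _lrn_example():
    import numpy as np
    shape = (1, 6, 5, 5)
    size, axis, alpha, beta, bias = 3, 1, 0.0001, 0.75, 2.0
    A = tvm.placeholder(shape, name='A')
    B = lrn(A, size, axis, alpha, beta, bias)
    s = tvm.create_schedule(B.op)
    f = tvm.build(s, [A, B], 'llvm')
    ctx = tvm.cpu(0)
    a_np = np.random.uniform(size=shape).astype('float32')
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(shape, dtype=B.dtype), ctx)
    f(a, b)
    # numpy reference: pad the channel axis, take a running sum of squares
    radius = size // 2
    padded = np.pad(a_np, ((0, 0), (radius, radius), (0, 0), (0, 0)), mode='constant')
    sqr_sum = np.zeros(shape, dtype='float32')
    for c in range(shape[1]):
        sqr_sum[:, c, :, :] = np.sum(padded[:, c:c + size, :, :] ** 2, axis=1)
    ref = a_np / np.power(bias + alpha * sqr_sum / size, beta)
    tvm.testing.assert_allclose(b.asnumpy(), ref, rtol=1e-4)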
def _depthwise_conv2d_NCHWc_cpu(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype=None): out_dtype = data.dtype if out_dtype is None else out_dtype batch, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple( data.shape) out_channel_chunk, _, filter_height, filter_width, __, out_channel_block \ = get_const_tuple(kernel.shape) strides = strides if isinstance(strides, (tuple, list)) else (strides, strides) HSTR, WSTR = strides pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (filter_height, filter_width)) dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) assert (dh, dw) == (1, 1), "Does not support dilation" in_channel = in_channel_chunk * in_channel_block out_channel = out_channel_chunk * out_channel_block channel_multiplier = out_channel // in_channel out_height = (in_height - filter_height + pad_top + pad_down) // HSTR + 1 out_width = (in_width - filter_width + pad_left + pad_right) // WSTR + 1 # get workload and related schedule config wkl = _get_workload( tvm.placeholder((batch, in_channel, in_height, in_width), dtype=data.dtype), tvm.placeholder((out_channel, in_channel, filter_height, filter_width), dtype=kernel.dtype), strides, padding, out_dtype) if cfg.is_fallback: _fallback_schedule(cfg, wkl) # padding stage DOPAD = (pad_top != 0 or pad_left != 0 or pad_down != 0 or pad_right != 0) if DOPAD: pad_before = [0, 0, pad_top, pad_left, 0] pad_after = [0, 0, pad_down, pad_right, 0] data_pad = pad(data, pad_before, pad_after, name="PaddedInput") else: data_pad = data # depthconv stage idxdiv = tvm.indexdiv idxmod = tvm.indexmod kh = tvm.reduce_axis((0, filter_height), name='kh') kw = tvm.reduce_axis((0, filter_width), name='kw') Output = tvm.compute( (batch, out_channel_chunk, out_height, out_width, out_channel_block), lambda b, oco, oh, ow, oci: tvm.sum((data_pad[ b, idxdiv(idxdiv(oco * out_channel_block + oci, channel_multiplier), in_channel_block), oh * HSTR + kh, ow * WSTR + kw, idxmod(idxdiv(oco * out_channel_block + oci, channel_multiplier), in_channel_block)].astype(out_dtype) * kernel[ oco, 0, kh, kw, 0, oci].astype(out_dtype)), axis=[kh, kw]), name='DepthwiseConv2d', tag="depthwise_conv2d_NCHWc") return Output
def test_vectorize_commreduce(): V = tvm.placeholder((128, ), name='V') ax = tvm.reduce_axis((0, 128), name='ax') O = tvm.compute((1, ), lambda _: tvm.sum(V[ax], axis=[ax])) s = tvm.create_schedule(O.op) s[O].vectorize(ax) # should throw here
def intrin_libxsmm_tuned(ofmblock, ofw, ifmblock, stride_width, ifw, rco, ifh, r, s, ifh_stride, ifw_stride, in_channel): last_input_width_index = (ofw - 1) * stride_width + s - 1 A = tvm.placeholder((rco, r, s, ifmblock, ofmblock), name='w') B = tvm.placeholder((rco, r, last_input_width_index + 1, ifmblock), name='b') k = tvm.reduce_axis((0, ifmblock), name='k') k_outer = tvm.reduce_axis((0, rco), name='k_outer') ry = tvm.reduce_axis((0, r), name='ry') rx = tvm.reduce_axis((0, s), name='rx') C = tvm.compute((ofw, ofmblock), lambda m, n: tvm.sum(A[k_outer, ry, rx, k, n] * B[ k_outer, ry, rx + m * stride_width, k], axis=[k_outer, ry, rx, k]), name='out') s1 = tvm.create_schedule(C.op) w, ofm = s1[C].op.axis kco, ky, kx, kci = s1[C].op.reduce_axis s1[C].reorder(kco, ky, kx, w, ofm, kci) xx_ptr = tvm.decl_buffer(A.shape, A.dtype, name="W", offset_factor=1, data_alignment=64) yy_ptr = tvm.decl_buffer( B.shape, B.dtype, name="some", offset_factor=1, strides=[tvm.var("s3"), tvm.var("s2"), ifmblock, 1], data_alignment=64) zz_ptr = tvm.decl_buffer(C.shape, C.dtype, name="OUT", offset_factor=1, data_alignment=64) def intrin_func(ins, outs): # tvm call extern is used to interface to libxsmm batch reduce kernel gemm implementation # rco*r*s is the number of batches init_and_compute = tvm.call_extern ("int32","batch_reduce_kernel_init_update", ins[0].access_ptr("r"),ins[1].access_ptr("r"),outs[0].access_ptr("w"),\ rco*r*s,ofmblock,ifmblock,r,s,ifh_stride,ifw_stride, ofw, stride_width) reset = tvm.call_extern("int32", "batch_reduce_kernel_init", outs[0].access_ptr("w"), ofmblock, ofw) body = tvm.call_extern ("int32","batch_reduce_kernel_update", ins[0].access_ptr("r"),ins[1].access_ptr("r"),outs[0].access_ptr("w"), rco*r*s,ofmblock,\ ifmblock,ofw, stride_width,r,s, ifh_stride,ifw_stride) if math.ceil(in_channel / ifmblock) == rco: return init_and_compute, None, init_and_compute else: return init_and_compute, reset, body with tvm.build_config(data_alignment=64): return tvm.decl_tensor_intrin(C.op, intrin_func, name="GEMM", binds={ A: xx_ptr, B: yy_ptr, C: zz_ptr })
def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None): """Depthwise convolution nchw forward operator. Parameters ---------- Input : tvm.Tensor 4-D with shape [batch, in_channel, in_height, in_width] Filter : tvm.Tensor 4-D with shape [in_channel, channel_multiplier, filter_height, filter_width] stride : tuple of two ints The spatial stride along height and width padding : int or str Padding size, or ['VALID', 'SAME'] dilation: int or a list/tuple of two ints dilation size, or [dilation_height, dilation_width] out_dtype: str, optional Output data type Returns ------- Output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ out_dtype = Input.dtype if out_dtype is None else out_dtype if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation batch, in_channel, in_height, in_width = Input.shape # shape of dilated kernel filter_channel, channel_multiplier, filter_height, filter_width = Filter.shape dilated_kernel_h = (filter_height - 1) * dilation_h + 1 dilated_kernel_w = (filter_width - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) out_channel = simplify(in_channel * channel_multiplier) out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) # padding stage pad_before = [0, 0, pad_top, pad_left] pad_after = [0, 0, pad_down, pad_right] PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput") # depthconv stage idxdiv = tvm.indexdiv idxmod = tvm.indexmod di = tvm.reduce_axis((0, filter_height), name='di') dj = tvm.reduce_axis((0, filter_width), name='dj') Output = tvm.compute( (batch, out_channel, out_height, out_width), lambda b, c, i, j: tvm.sum( (PaddedInput[b, idxdiv(c, channel_multiplier), i*stride_h+di*dilation_h, j*stride_w+dj*dilation_w].astype(out_dtype) * Filter[idxdiv(c, channel_multiplier), idxmod(c, channel_multiplier), di, dj].astype(out_dtype)), axis=[di, dj]), name='DepthwiseConv2d', tag="depthwise_conv2d_nchw") return Output
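# Usage sketch for depthwise_conv2d_nchw above (assumes the function and its topi
# helpers are importable). The default schedule is enough for a functional check
# on llvm; the reference here is a direct per-channel correlation in numpy.
def _depthwise_conv2d_example():
    import numpy as np
    batch, in_c, size, mult, ksize = 1, 4, 8, 2, 3
    Input = tvm.placeholder((batch, in_c, size, size), name='Input')
    Filter = tvm.placeholder((in_c, mult, ksize, ksize), name='Filter')
    Output = depthwise_conv2d_nchw(Input, Filter, stride=1, padding='VALID', dilation=1)
    s = tvm.create_schedule(Output.op)
    f = tvm.build(s, [Input, Filter, Output], 'llvm')
    ctx = tvm.cpu(0)
    i_np = np.random.uniform(size=(batch, in_c, size, size)).astype('float32')
    f_np = np.random.uniform(size=(in_c, mult, ksize, ksize)).astype('float32')
    out_shape = (batch, in_c * mult, size - ksize + 1, size - ksize + 1)
    o = tvm.nd.array(np.zeros(out_shape, dtype=Output.dtype), ctx)
    f(tvm.nd.array(i_np, ctx), tvm.nd.array(f_np, ctx), o)
    # naive reference: output channel c*mult + m correlates input channel c with filter [c, m]
    ref = np.zeros(out_shape, dtype='float32')
    for c in range(in_c):
        for m in range(mult):
            for y in range(out_shape[2]):
                for x in range(out_shape[3]):
                    ref[0, c * mult + m, y, x] = np.sum(
                        i_np[0, c, y:y + ksize, x:x + ksize] * f_np[c, m])
    tvm.testing.assert_allclose(o.asnumpy(), ref, rtol=1e-5)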
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation if len(kernel.shape) == 4: if dilation_h != 1 or dilation_w != 1: kernel = dilate(kernel, (1, 1, dilation_h, dilation_w)) pre_computed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation" pre_computed = True H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape) CO *= VC KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1 HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) assert layout == 'NCHW' assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") r = KW m = tile_size alpha = m + r - 1 A, B, G = winograd_transform_matrices(m, r, out_dtype) H = (IH + 2 * HPAD - 3) // HSTR + 1 W = (IW + 2 * WPAD - 3) // WSTR + 1 nH, nW = (H + m - 1) // m, (W + m - 1) // m P = N * nH * nW ##### space definition begin ##### tile_bna_candidates = [1, 2, 4, 8, 16] factors = get_factors(CO) cfg.define_knob('tile_bna', [x for x in tile_bna_candidates if x in factors]) cfg.define_knob('tile_bnb', [1, 2, 4, 8, 16]) cfg.define_split('tile_t1', CI, num_outputs=2, max_factor=128) cfg.define_split('tile_t2', CO, num_outputs=2, max_factor=128) cfg.define_split('c_unroll', CI, num_outputs=2, max_factor=8) cfg.define_knob('yt', [1, 2, 4, 8, 16, 32]) ##### space definition end ##### if cfg.is_fallback: cfg['tile_bnb'].val = 4 cfg['tile_bna'].val = 4 while CO % cfg['tile_bna'].val != 0: cfg['tile_bna'].val //= 2 cfg['yt'].val = 8 cfg.fallback_split('tile_t1', [-1, 128]) cfg.fallback_split('tile_t2', [-1, 128]) cfg.fallback_split('c_unroll', [-1, 8]) bna = cfg['tile_bna'].val bnb = cfg['tile_bnb'].val P_round = (P + bnb - 1) // bnb * bnb assert CO % bna == 0 and P_round % bnb == 0 # pack input tile input_tile = tvm.compute((CI, P_round // bnb, alpha, alpha, bnb), lambda ci, b, eps, nu, bb: \ tvm.if_then_else( b * bnb + bb < P, data_pad[(b*bnb+bb) // (nH*nW)][ci][(b*bnb+bb) // nW % nH * m + eps] [(b*bnb+bb) % nW * m + nu], tvm.const(0, data_pad.dtype)), name='d') # transform kernel if pre_computed: U = kernel else: r_kh = tvm.reduce_axis((0, KH), 'r_kh') r_kw = tvm.reduce_axis((0, KW), 'r_kw') U = tvm.compute( (alpha, alpha, CO // bna, CI, bna), lambda eps, nu, co, ci, vco: tvm.sum(kernel[co * bna + vco][ci][ r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U') # transform image r_a = tvm.reduce_axis((0, alpha), 'r_a') r_b = tvm.reduce_axis((0, alpha), 'r_b') V = tvm.compute((alpha, alpha, P_round // bnb, CI, bnb), lambda eps, nu, p, ci, vp: tvm.sum(input_tile[ci][p][r_a][ r_b][vp] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]), name='V') idxdiv = tvm.indexdiv idxmod = tvm.indexmod # batch gemm ci = tvm.reduce_axis((0, CI), name='c') M = tvm.compute( (alpha, alpha, CO, P_round), lambda eps, nu, co, p: tvm.sum(U[eps][nu][idxdiv(co, bna)][ci][idxmod( co, bna)] * V[eps][nu][idxdiv(p, bnb)][ci][idxmod(p, bnb)], axis=ci), name='M') r_a = tvm.reduce_axis((0, alpha), 'r_a') r_b = tvm.reduce_axis((0, alpha), 'r_b') Y = tvm.compute( (CO, P, m, m), lambda co, p, vh, vw: tvm.sum( M[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]), name='Y') # unpack output output = tvm.compute( (N, CO, H, W), lambda n, co, h, w: Y[co, n * nH * nW + idxdiv(h, m) * 
nW + idxdiv( w, m), idxmod(h, m), idxmod(w, m)] # The following hack term is used to make the padding in batch gemm ("M") # effective, otherwise the padding will be eliminated by bound inference. # Use `tvm.expr.Mul` instead of `*` to avoid issues in const folding. + tvm.expr.Mul(tvm.const(0, out_dtype), M[alpha - 1][alpha - 1][CO - 1] [P_round - 1]), name='output', tag='winograd_conv2d_output') # we have to manually assign effective GFLOP for winograd cfg.add_flop(2 * N * CO * H * W * KH * KW * CI) return output
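# Worked example of the tiling arithmetic used above, under assumed sizes
# (N=1, 56x56 input, 3x3 kernel, tile_size m=2, pad 1, stride 1, bnb=4).
m, r = 2, 3
alpha = m + r - 1                              # 4: transformed tiles are 4x4
IH = IW = 56
HPAD = WPAD = HSTR = WSTR = 1
H = (IH + 2 * HPAD - 3) // HSTR + 1            # 56
W = (IW + 2 * WPAD - 3) // WSTR + 1            # 56
nH, nW = (H + m - 1) // m, (W + m - 1) // m    # 28, 28
N, bnb = 1, 4
P = N * nH * nW                                # 784 output tiles
P_round = (P + bnb - 1) // bnb * bnb           # 784, already a multiple of bnb
print(alpha, P, P_round)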
def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, pre_computed):
    """Compute declaration for winograd"""
    assert layout == 'NCHW'

    tile_size = _infer_tile_size(data, kernel)

    N, CI, H, W = get_const_tuple(data.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation
    HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides

    if not pre_computed:  # kernel tensor is raw tensor, do strict check
        if dilation_h != 1 or dilation_w != 1:
            # dilate the raw kernel before reading its shape
            kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w))
        CO, CI, KH, KW = get_const_tuple(kernel.shape)
        alpha = KW + tile_size - 1
        assert HSTR == 1 and WSTR == 1 and KH == KW
    else:
        # kernel tensor is pre-transformed. this op is created by alter op layout.
        # dilation is not supported
        alpha, _, CI, CO = get_const_tuple(kernel.shape)
        KH = KW = alpha + 1 - tile_size
        assert HSTR == 1 and WSTR == 1 and dilation_h == 1 and dilation_w == 1

    pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW))
    data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad")

    r = KW
    m = tile_size
    A, B, G = winograd_transform_matrices(m, r, out_dtype)

    H = (H + pt + pb - KH) // HSTR + 1
    W = (W + pl + pr - KW) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    # transform kernel
    if not pre_computed:
        r_kh = tvm.reduce_axis((0, KH), name='r_kh')
        r_kw = tvm.reduce_axis((0, KW), name='r_kw')
        kernel_pack = tvm.compute(
            (alpha, alpha, CI, CO),
            lambda eps, nu, ci, co: tvm.sum(
                kernel[co][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
                axis=[r_kh, r_kw]),
            name='kernel_pack')
    else:
        kernel_pack = kernel

    idxdiv = tvm.indexdiv
    idxmod = tvm.indexmod

    # pack input tile
    input_tile = tvm.compute(
        (CI, P, alpha, alpha),
        lambda c, p, eps, nu: data_pad[idxdiv(p, (nH * nW))][c][
            idxmod(idxdiv(p, nW), nH) * m + eps][idxmod(p, nW) * m + nu],
        name='d')

    # transform data
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    data_pack = tvm.compute(
        (alpha, alpha, CI, P),
        lambda eps, nu, ci, p: tvm.sum(
            input_tile[ci][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu],
            axis=[r_a, r_b]),
        name='data_pack')

    # do batch gemm
    ci = tvm.reduce_axis((0, CI), name='ci')
    bgemm = tvm.compute(
        (alpha, alpha, CO, P),
        lambda eps, nu, co, p: tvm.sum(
            kernel_pack[eps][nu][ci][co] * data_pack[eps][nu][ci][p],
            axis=[ci]),
        name='bgemm')

    # inverse transform
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    inverse = tvm.compute(
        (CO, P, m, m),
        lambda co, p, vh, vw: tvm.sum(
            bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw],
            axis=[r_a, r_b]),
        name='inverse')

    # output
    output = tvm.compute(
        (N, CO, H, W),
        lambda n, co, h, w: inverse[co,
                                    n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m),
                                    idxmod(h, m), idxmod(w, m)],
        name='output', tag='conv2d_nchw_winograd')

    cfg.add_flop(2 * N * CO * H * W * CI * KH * KW)

    return output
'N = ' + str(N) + '\n' 'dtype = "float32"\n' 'a = numpy.random.rand(M, K).astype(dtype)\n' 'b = numpy.random.rand(K, N).astype(dtype)\n', stmt='answer = numpy.dot(a, b)', number=np_repeat) print("Numpy running time: %f" % (np_runing_time / np_repeat)) answer = numpy.dot(a.asnumpy(), b.asnumpy()) # Algorithm k = tvm.reduce_axis((0, K), 'k') A = tvm.placeholder((M, K), name='A') B = tvm.placeholder((K, N), name='B') C = tvm.compute((M, N), lambda x, y: tvm.sum(A[x, k] * B[k, y], axis=k), name='C') # Default schedule s = tvm.create_schedule(C.op) func = tvm.build(s, [A, B, C], target=target, name='mmult') assert func c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) evaluator = func.time_evaluator(func.entry_name, ctx, number=1) print('Baseline: %f' % evaluator(a, b, c).mean) ################################################################################################
# Input placeholder tensors
data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
weight = tvm.placeholder(weight_shape, name="weight", dtype=env.wgt_dtype)

# Copy buffers
data_buf = tvm.compute(data_shape, lambda *i: data(*i), "data_buf")
weight_buf = tvm.compute(weight_shape, lambda *i: weight(*i), "weight_buf")

# Declare matrix multiply computation
res_gemm = tvm.compute(output_shape,
                       lambda bo, co, bi, ci: tvm.sum(
                           data_buf[bo, ic, bi, ic_tns].astype(env.acc_dtype) *
                           weight_buf[co, ic, ci, ic_tns].astype(env.acc_dtype),
                           axis=[ic, ic_tns]),
                       name="res_gem")

# Add shift stage for fixed-point normalization
res_shr = tvm.compute(output_shape,
                      lambda *i: res_gemm(*i) >> env.INP_WIDTH,
                      name="res_shr")

# Apply clipping between (0, input max value)
inp_max = (1 << (env.INP_WIDTH - 1)) - 1
res_max = tvm.compute(output_shape,
                      lambda *i: tvm.max(res_shr(*i), 0),
                      "res_max")
res_min = tvm.compute(output_shape,
                      lambda *i: tvm.min(res_max(*i), inp_max),
                      "res_min")
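# Concrete illustration of the shift-and-clip stages above with numpy
# (INP_WIDTH = 8 is just an assumed value; VTA reads it from the environment).
import numpy as np

INP_WIDTH = 8
acc = np.array([-1000, -5, 300, 40000], dtype=np.int32)   # pretend accumulator values
shifted = acc >> INP_WIDTH                                 # arithmetic shift: [-4, -1, 1, 156]
clipped = np.clip(shifted, 0, (1 << (INP_WIDTH - 1)) - 1)  # clamp to [0, 127]: [0, 0, 1, 127]
print(clipped)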
def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_dtype): """Convolution operator in NCHW[x]c layout for int8. Parameters ---------- cfg: ConfigEntity The config for this template data : tvm.Tensor 4-D with shape [batch, in_channel, in_height, in_width] or 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] kernel : tvm.Tensor 4-D with shape [num_filter, in_channel, filter_height, filter_width] or 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, filter_width, num_filter_block, in_channel_block] stride : int or a list/tuple of two ints stride size, or [stride_height, stride_width] padding: int or a list/tuple of two ints padding size, or [pad_height, pad_width] dilation: int or a list/tuple of two ints dilation size, or [dilation_height, dilation_width] layout : str layout of data out_dtype : str The output type. This is used for mixed precision. Returns ------- output : tvm.Tensor 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] """ assert layout in ["NCHW", "NCHW4c"] ic_block_factor = 4 oc_block_factor = 4 pre_computed = len(kernel.shape) == 6 if not pre_computed: batch, channels, height, width = get_const_tuple(data.shape) assert channels % ic_block_factor == 0, \ "Number of input channels should be multiple of {}".format( ic_block_factor) packed_data = tvm.compute( (batch, channels // ic_block_factor, height, width, ic_block_factor), lambda n, c, h, w, vc: data[n, c * ic_block_factor + vc, h, w], name="packed_data") out_channels, in_channels, kernel_h, kernel_w = get_const_tuple( kernel.shape) assert out_channels % 4 == 0, \ "Number of output channels should be multiple of {}".format( oc_block_factor) packed_kernel = tvm.compute( (out_channels // oc_block_factor, in_channels // ic_block_factor, kernel_h, kernel_w, oc_block_factor, ic_block_factor), lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block: kernel[ oc_chunk * oc_block_factor + oc_block, ic_chunk * ic_block_factor + ic_block, kh, kw], name="packed_kernel") else: packed_data = data packed_kernel = kernel batch, ic_chunk, in_height, in_width, ic_block = get_const_tuple( packed_data.shape) oc_chunk, ic_chunk, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple( packed_kernel.shape) if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (kernel_h, kernel_w)) # compute graph pad_before = [0, 0, pad_top, pad_left, 0] pad_after = [0, 0, pad_down, pad_right, 0] pad_data = pad(packed_data, pad_before, pad_after, name="pad_data") # compute the output shape out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1 out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1 oshape = (batch, oc_chunk, out_height, out_width, oc_block) icc = tvm.reduce_axis((0, ic_chunk), name='ic_chunk') icb = tvm.reduce_axis((0, ic_block), name='ic_block') kh = tvm.reduce_axis((0, kernel_h), name='kh') kw = tvm.reduce_axis((0, kernel_w), name='kw') conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(pad_data[n, icc, oh*stride_h+kh*dilation_h, \ ow*stride_w+kw*dilation_w, icb] .astype('int32') * packed_kernel[oc_chunk, icc, kh, kw, oc_block, icb] .astype('int32'), axis=[icc, kh, kw, icb])) output = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, 
oc_block: conv[ n, oc_chunk, oh, ow, oc_block].astype(out_dtype), tag="conv2d_NCHWc_int8") # num flop num_flop = batch * oc_chunk * oc_block * out_height * out_width * \ ic_chunk * ic_block * kernel_h * kernel_w * 2 cfg.add_flop(num_flop) return output
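# What the NCHW -> NCHW4c packing above does, shown with numpy on small,
# arbitrary sizes (the TVM compute produces the same layout symbolically).
import numpy as np

n, c, h, w, cb = 1, 8, 5, 5, 4
x = np.arange(n * c * h * w, dtype=np.int32).reshape(n, c, h, w)
packed = x.reshape(n, c // cb, cb, h, w).transpose(0, 1, 3, 4, 2)   # [n, c//4, h, w, 4]
# packed[n, cc, i, j, vc] == x[n, cc * cb + vc, i, j]
assert packed[0, 1, 2, 3, 2] == x[0, 1 * cb + 2, 2, 3]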
def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, pre_computed): """Compute declaration for winograd""" assert layout == 'NCHW' tile_size = _infer_tile_size(data, kernel) N, CI, H, W = get_const_tuple(data.shape) if not pre_computed: # kernel tensor is raw tensor, do strict check if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation if dilation_h != 1 or dilation_w != 1: kernel = dilate(kernel, (1, 1, dilation_h, dilation_w)) CO, CI, KH, KW = get_const_tuple(kernel.shape) HPAD, WPAD, _, _ = nn.get_pad_tuple(padding, kernel) HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1 and KH == 3 and KW == 3 else: # kernel tensor is pre-transfomred. this op is created by # alter op layout, do not check # dilation is not supported HSTR = WSTR = 1 HPAD = WPAD = 1 KH = KW = 3 _, _, CI, CO = get_const_tuple(kernel.shape) data_pad = nn.pad(data, (0, 0, HPAD, WPAD), (0, 0, HPAD, WPAD), name="data_pad") if tile_size == 4: G_data = np.array( [[1 / 4.0, 0, 0], [-1 / 6.0, -1 / 6.0, -1 / 6.0], [-1 / 6.0, 1 / 6.0, -1 / 6.0], [1 / 24.0, 1 / 12.0, 1 / 6.0], [1 / 24.0, -1 / 12.0, 1 / 6.0], [0, 0, 1]], dtype=np.float32) B_data = np.array( [[4, 0, 0, 0, 0, 0], [0, -4, 4, -2, 2, 4], [-5, -4, -4, -1, -1, 0], [0, 1, -1, 2, -2, -5], [1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 1]], out_dtype) A_data = np.array([[1, 0, 0, 0], [1, 1, 1, 1], [1, -1, 1, -1], [1, 2, 4, 8], [1, -2, 4, -8], [0, 0, 0, 1]], out_dtype) elif tile_size == 2: G_data = np.array([[1, 0, 0], [1.0 / 2, 1.0 / 2, 1.0 / 2], [1.0 / 2, -1.0 / 2, 1.0 / 2], [0, 0, 1]], np.float32) B_data = np.array( [[1, 0, 0, 0], [0, 1, -1, 1], [-1, 1, 1, 0], [0, 0, 0, -1]], out_dtype) A_data = np.array([[1, 0], [1, 1], [1, -1], [0, -1]], out_dtype) else: raise ValueError("Unsupported tile size for winograd: " + str(tile_size)) m = A_data.shape[1] r = 3 alpha = m + r - 1 H = (H + 2 * HPAD - KH) // HSTR + 1 W = (W + 2 * WPAD - KW) // WSTR + 1 nH, nW = (H + m - 1) // m, (W + m - 1) // m P = N * nH * nW # transform kernel if not pre_computed: G = const_matrix(G_data, 'G') r_kh = tvm.reduce_axis((0, KH), name='r_kh') r_kw = tvm.reduce_axis((0, KW), name='r_kw') kernel_pack = tvm.compute( (alpha, alpha, CI, CO), lambda eps, nu, ci, co: tvm.sum(kernel[co][ci][r_kh][r_kw] * G[eps] [r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='kernel_pack') else: kernel_pack = kernel # pack input tile input_tile = tvm.compute((CI, P, alpha, alpha), lambda c, p, eps, nu: data_pad[p // (nH * nW)][c][ p // nW % nH * m + eps][p % nW * m + nu], name='d') # transform data B = const_matrix(B_data) r_a = tvm.reduce_axis((0, alpha), 'r_a') r_b = tvm.reduce_axis((0, alpha), 'r_a') data_pack = tvm.compute((alpha, alpha, CI, P), lambda eps, nu, ci, p: tvm.sum(input_tile[ci][p][ r_a][r_b] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]), name='data_pack') # do batch gemm ci = tvm.reduce_axis((0, CI), name='ci') bgemm = tvm.compute((alpha, alpha, CO, P), lambda eps, nu, co, p: tvm.sum(kernel_pack[eps][nu][ ci][co] * data_pack[eps][nu][ci][p], axis=[ci]), name='bgemm') # inverse transform A = const_matrix(A_data) r_a = tvm.reduce_axis((0, alpha), 'r_a') r_b = tvm.reduce_axis((0, alpha), 'r_a') inverse = tvm.compute( (CO, P, m, m), lambda co, p, vh, vw: tvm.sum( bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]), name='inverse') # output output = tvm.compute( (N, CO, H, W), lambda n, co, h, w: inverse[co][n * nH * nW + (h // m) * nW + w // m][h % 
m][w % m], name='output', tag='conv2d_nchw_winograd') cfg.add_flop(2 * N * CO * H * W * CI * KH * KW) return output
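# Quick numpy sanity check of the tile_size=2 transform matrices above, written
# from the standard F(2x2, 3x3) Winograd identity Y = A^T [(G g G^T) * (B^T d B)] A.
import numpy as np

G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]])
B = np.array([[1, 0, 0, 0], [0, 1, -1, 1], [-1, 1, 1, 0], [0, 0, 0, -1]])
A = np.array([[1, 0], [1, 1], [1, -1], [0, -1]])

d = np.random.rand(4, 4)   # one padded input tile
g = np.random.rand(3, 3)   # one 3x3 filter
U = G @ g @ G.T            # kernel transform
V = B.T @ d @ B            # data transform
Y = A.T @ (U * V) @ A      # inverse transform, 2x2 output tile

ref = np.array([[(d[i:i + 3, j:j + 3] * g).sum() for j in range(2)] for i in range(2)])
np.testing.assert_allclose(Y, ref, rtol=1e-5, atol=1e-8)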
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation if len(kernel.shape) == 4: if dilation_h != 1 or dilation_w != 1: kernel = dilate(kernel, (1, 1, dilation_h, dilation_w)) pre_computed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation" pre_computed = True H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape) CO *= VC KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1 HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) assert layout == 'NCHW' assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1 data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") if tile_size == 4: G_data = np.array( [[1 / 4.0, 0, 0], [-1 / 6.0, -1 / 6.0, -1 / 6.0], [-1 / 6.0, 1 / 6.0, -1 / 6.0], [1 / 24.0, 1 / 12.0, 1 / 6.0], [1 / 24.0, -1 / 12.0, 1 / 6.0], [0, 0, 1]], dtype=np.float32) B_data = np.array( [[4, 0, 0, 0, 0, 0], [0, -4, 4, -2, 2, 4], [-5, -4, -4, -1, -1, 0], [0, 1, -1, 2, -2, -5], [1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 1]], out_dtype) A_data = np.array([[1, 0, 0, 0], [1, 1, 1, 1], [1, -1, 1, -1], [1, 2, 4, 8], [1, -2, 4, -8], [0, 0, 0, 1]], out_dtype) elif tile_size == 2: G_data = np.array([[1, 0, 0], [1.0 / 2, 1.0 / 2, 1.0 / 2], [1.0 / 2, -1.0 / 2, 1.0 / 2], [0, 0, 1]], np.float32) B_data = np.array( [[1, 0, 0, 0], [0, 1, -1, 1], [-1, 1, 1, 0], [0, 0, 0, -1]], out_dtype) A_data = np.array([[1, 0], [1, 1], [1, -1], [0, -1]], out_dtype) else: raise ValueError("Unsupported tile size for winograd: " + str(tile_size)) m = A_data.shape[1] r = 3 alpha = m + r - 1 K = CO C = CI H = (IH + 2 * HPAD - 3) // HSTR + 1 W = (IW + 2 * WPAD - 3) // WSTR + 1 nH, nW = (H + m - 1) // m, (W + m - 1) // m P = N * nH * nW cfg.define_split('tile_p', cfg.axis(P), num_outputs=2, filter=lambda x: x.size[-1] <= 16) cfg.define_split('tile_k', cfg.axis(K), num_outputs=2, filter=lambda x: x.size[-1] <= 16) VP = cfg['tile_p'].size[-1] VK = cfg['tile_k'].size[-1] # pack input tile input_tile = tvm.compute( (C, P // VP, alpha, alpha, VP), lambda c, b, eps, nu, bb: data_pad[(b * VP + bb) // (nH * nW)][c][ (b * VP + bb) // nW % nH * m + eps][(b * VP + bb) % nW * m + nu], name='d') # transform kernel if pre_computed: U = kernel else: G = const_matrix(G_data, 'G') r_kh = tvm.reduce_axis((0, KH), 'r_kh') r_kw = tvm.reduce_axis((0, KW), 'r_kw') U = tvm.compute( (alpha, alpha, K // VK, C, VK), lambda eps, nu, k, c, kk: tvm.sum(kernel[k * VK + kk][c][r_kh][ r_kw].astype(out_dtype) * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U') # transform image B = const_matrix(B_data, 'B') r_eps = tvm.reduce_axis((0, alpha), 'r_eps') r_nu = tvm.reduce_axis((0, alpha), 'r_nu') V = tvm.compute( (alpha, alpha, P // VP, C, VP), lambda eps, nu, b, c, bb: tvm.sum(input_tile[c][b][r_eps][r_nu][ bb].astype(out_dtype) * B[r_eps][eps] * B[r_nu][nu], axis=[r_eps, r_nu]), name='V') # batch gemm c = tvm.reduce_axis((0, C), name='c') M = tvm.compute((alpha, alpha, K, P), lambda eps, nu, k, b: tvm.sum(U[eps][nu][k // VK][c][ k % VK] * V[eps][nu][b // VP][c][b % VP], axis=c), name='M') # inverse transform A = const_matrix(A_data, 'A') r_eps = tvm.reduce_axis((0, alpha), 'r_eps') r_nu = tvm.reduce_axis((0, alpha), 'r_nu') Y = tvm.compute((K, P, m, m), lambda k, b, vh, vw: 
tvm.sum(M[r_eps][r_nu][k][b] * A[ r_eps][vh] * A[r_nu][vw], axis=[r_eps, r_nu]), name='Y') # unpack output output = tvm.compute( (N, K, H, W), lambda n, k, h, w: Y[k][n * nH * nW + (h // m) * nW + w // m][h % m][w % m], name='output', tag='winograd_conv2d_output') # we have to manually assign effective GFLOP for winograd cfg.add_flop(2 * N * K * H * W * KH * KW * C) return output
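# Note: cfg.add_flop above records the cost of the *equivalent direct convolution*,
# not the smaller number of multiplies Winograd actually performs, so reported
# GFLOPS stay comparable across algorithms. A rough multiply count for the batch
# GEMM alone, with assumed sizes N=1, C=K=64, 56x56 output, 3x3 kernel, m=2 (alpha=4):
def direct_macs(N, K, C, H, W, KH, KW):
    return N * K * H * W * KH * KW * C

def winograd_gemm_macs(alpha, K, P, C):
    # M[eps, nu, k, b] accumulates over C for every (eps, nu, k, b); transforms excluded.
    return alpha * alpha * K * P * C

print(direct_macs(1, 64, 64, 56, 56, 3, 3))        # 115605504
print(winograd_gemm_macs(4, 64, 1 * 28 * 28, 64))  # 51380224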
def gemm(env, mock=False): """Matrix-matrix multiply intrinsic Parameters ---------- env : Environment The Environment mock : bool Whether create a mock version. """ wgt_lanes = env.WGT_ELEM_BITS // env.WGT_WIDTH assert wgt_lanes == env.BLOCK_OUT * env.BLOCK_IN wgt_shape = (env.BLOCK_OUT, env.BLOCK_IN) assert wgt_shape[0] * wgt_shape[1] == wgt_lanes inp_lanes = env.INP_ELEM_BITS // env.INP_WIDTH assert inp_lanes == env.BATCH * env.BLOCK_IN inp_shape = (env.BATCH, env.BLOCK_IN) assert inp_shape[0] * inp_shape[1] == inp_lanes out_lanes = env.ACC_ELEM_BITS // env.ACC_WIDTH assert out_lanes == env.BATCH * env.BLOCK_OUT out_shape = (env.BATCH, env.BLOCK_OUT) assert out_shape[0] * out_shape[1] == out_lanes wgt = tvm.placeholder((wgt_shape[0], wgt_shape[1]), dtype="int%d" % env.WGT_WIDTH, name=env.wgt_scope) inp = tvm.placeholder((inp_shape[0], inp_shape[1]), dtype="int%d" % env.INP_WIDTH, name=env.inp_scope) k = tvm.reduce_axis((0, wgt_shape[1]), name="k") out_dtype = "int%d" % env.ACC_WIDTH out = tvm.compute((out_shape[0], out_shape[1]), lambda i, j: tvm.sum(inp[i, k].astype(out_dtype) * wgt[ j, k].astype(out_dtype), axis=[k]), name="out") wgt_layout = tvm.decl_buffer(wgt.shape, wgt.dtype, env.wgt_scope, scope=env.wgt_scope, offset_factor=wgt_lanes, data_alignment=wgt_lanes) inp_layout = tvm.decl_buffer(inp.shape, inp.dtype, env.inp_scope, scope=env.inp_scope, offset_factor=inp_lanes, data_alignment=inp_lanes) out_layout = tvm.decl_buffer(out.shape, out.dtype, env.acc_scope, scope=env.acc_scope, offset_factor=out_lanes, data_alignment=out_lanes) def intrin_func(ins, outs): """Matrix-matrix multiply intrinsic function""" dinp, dwgt = ins dout = outs[0] def instr(index): """Generate matrix-matrix multiply VTA instruction""" irb = tvm.ir_builder.create() dev = env.dev irb.scope_attr(dev.vta_axis, "coproc_scope", dev.get_task_qid(dev.QID_COMPUTE)) irb.scope_attr(dev.vta_axis, "coproc_uop_scope", dev.vta_push_uop) if index in (0, 2): irb.emit( tvm.call_extern("int32", "VTAUopPush", 0, 0, dout.access_ptr("rw", "int32"), dinp.access_ptr("r", "int32"), dwgt.access_ptr("r", "int32"), 0, 0, 0)) else: irb.emit( tvm.call_extern("int32", "VTAUopPush", 0, 1, dout.access_ptr("rw", "int32"), 0, 0, 0, 0, 0)) return irb.get() # return a triple of normal-set, reset, update nop = tvm.make.Evaluate(0) if mock: return (nop, nop, nop) return (instr(0), instr(1), instr(2)) return tvm.decl_tensor_intrin(out.op, intrin_func, name="GEMM", binds={ inp: inp_layout, wgt: wgt_layout, out: out_layout })
def rnn_matexp(): n_num_step = 128 n_num_hidden = 1152 n_batch_size = 4 detect_global_barrier = DETECT_GLOBAL_BARRIER num_step = tvm.var("num_step") num_hidden = tvm.convert(n_num_hidden) batch_size = tvm.convert(n_batch_size) num_thread_y = 8 num_thread_x = 16 * 3 num_sm = 24 Whh = tvm.placeholder((num_hidden, num_hidden), name="Whh") s_init = tvm.compute((1, batch_size, num_hidden), lambda _, i, j: 1.0, name="init") s_state = tvm.placeholder((num_step, batch_size, num_hidden)) kh = tvm.reduce_axis((0, num_hidden), name="kh") s_update = tvm.compute( (num_step, batch_size, num_hidden), lambda t, i, j: tvm.sum(s_state[t-1, i, kh] * Whh[kh, j], axis=kh), name="update") s_scan = tvm.scan(s_init, s_update, s_state) # schedule s = tvm.create_schedule(s_scan.op) CL = s_update SS = s.cache_read(s_state, "shared", [CL]) SL = s.cache_read(SS, "local", [CL]) WhhL = s.cache_read(Whh, "local", [CL]) ko, ki = s[CL].split(s[CL].op.reduce_axis[0], nparts=num_thread_y) CLF = s.rfactor(CL, ko) block_x = tvm.thread_axis((0, num_sm), "blockIdx.x") thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x") thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y") if PERSIST_KERNEL: s[s_scan.op].env_threads([block_x, thread_y, thread_x]) bx, xi = s[s_init].split(s_init.op.axis[2], nparts=num_sm) tx, xi = s[s_init].split(xi, nparts=num_thread_x) s[s_init].bind(bx, block_x) s[s_init].bind(tx, thread_x) bx, xi = s[s_update].split(s[CL].op.axis[2], nparts=num_sm) tx, xi = s[s_update].split(xi, nparts=num_thread_x) s[s_update].bind(bx, block_x) s[s_update].bind(tx, thread_x) s[CL].bind(s[CL].op.reduce_axis[0], thread_y) s[CLF].compute_at(s[CL], s[CL].op.reduce_axis[0]) # Duplicate store predicate. s[CL].set_store_predicate(thread_y.equal(0)) if PERSIST_KERNEL: s[WhhL].compute_at(s[s_scan], thread_x) s[WhhL].unroll(WhhL.op.axis[0]) else: s[WhhL].compute_at(s[CLF], CLF.op.axis[3]) kr, ki = s[CLF].split(CLF.op.reduce_axis[0], nparts=1) ko, ki = s[CLF].split(ki, factor=4) s[SS].compute_at(s[CLF], kr) s[SL].compute_at(s[CLF], ko) xo, xi = s[SS].split(SS.op.axis[2], factor=num_thread_x * num_thread_y * 3) ty, xi = s[SS].split(xi, nparts=num_thread_y) tx, xi = s[SS].split(xi, nparts=num_thread_x) s[SS].bind(ty, thread_y) s[SS].bind(tx, thread_x) def check_device(target): with tvm.build_config( detect_global_barrier=detect_global_barrier, auto_unroll_min_depth=2, auto_unroll_max_step=128, unroll_explicit=False): f = tvm.build(s, [s_scan, Whh], target) ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0) # launch the kernel. res_np = np.zeros( (n_num_step, n_batch_size, n_num_hidden)).astype("float32") Whh_np = np.zeros((n_num_hidden, n_num_hidden)).astype("float32") Whh_np[:] = 2.0 / n_num_hidden Whh_np[:, n_num_hidden//2:] = 0 res_a = tvm.nd.array(res_np, ctx) Whh_a = tvm.nd.array(Whh_np, ctx) # Skip first pass as it is compilation f(res_a, Whh_a) ctx.sync() # measure time cost of second step. tstart = time.time() f(res_a, Whh_a) ctx.sync() tgap = time.time() - tstart print("Time cost=%g" % tgap) # correctness if not SKIP_CHECK: res_gpu = res_a.asnumpy() res_cmp = np.ones_like(res_np).astype("float64") Whh_np = Whh_np.astype("float64") for t in range(1, n_num_step): res_cmp[t][:] = np.dot(res_cmp[t - 1], Whh_np) for i in range(n_num_step): for j in range(n_num_hidden): if abs(res_cmp[i,0,j] - res_gpu[i,0,j]) > 1e-5: print("%d, %d: %g vs %g" % (i,j, res_cmp[i,0,j], res_gpu[i,0,j])) np.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3) check_device("cuda")
def test_gemm(): # graph nn = 1024 n = tvm.var('n') n = tvm.convert(nn) m = n l = n A = tvm.placeholder((n, l), name='A') B = tvm.placeholder((m, l), name='B') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute((n, m), lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k), name='CC') # schedule s = tvm.create_schedule(C.op) xtile, ytile = 32, 32 scale = 8 num_thread = 8 block_factor = scale * num_thread block_x = tvm.thread_axis("blockIdx.x") thread_x = tvm.thread_axis("threadIdx.x") block_y = tvm.thread_axis("blockIdx.y") thread_y = tvm.thread_axis("threadIdx.y") CC = s.cache_write(C, "local") AA = s.cache_read(A, "shared", [CC]) BB = s.cache_read(B, "shared", [CC]) by, yi = s[C].split(C.op.axis[0], factor=block_factor) bx, xi = s[C].split(C.op.axis[1], factor=block_factor) s[C].reorder(by, bx, yi, xi) s[C].bind(by, block_y) s[C].bind(bx, block_x) ty, yi = s[C].split(yi, nparts=num_thread) tx, xi = s[C].split(xi, nparts=num_thread) s[C].reorder(ty, tx, yi, xi) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) yo, xo = CC.op.axis s[CC].reorder(k, yo, xo) s[CC].compute_at(s[C], tx) s[AA].compute_at(s[CC], k) s[BB].compute_at(s[CC], k) s[AA].double_buffer() s[BB].double_buffer() ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread) tx, xi = s[AA].split(xi, nparts=num_thread) s[AA].bind(ty, thread_y) s[AA].bind(tx, thread_x) ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread) tx, xi = s[BB].split(xi, nparts=num_thread) s[BB].bind(ty, thread_y) s[BB].bind(tx, thread_x) # lowering test s = s.normalize() # one line to build the function. def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("skip because %s is not enabled.." % device) return with tvm.target.create(device): f = tvm.build(s, [A, B, C]) # launch the kernel. n = nn m = n l = n a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(m, l)).astype(B.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) ftimer = f.time_evaluator(f.entry_name, ctx, number=1) tcost = ftimer(a, b, c).mean print("%s: exec=%g sec/op" % (ctx, tcost)) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T), rtol=1e-5) check_device("vulkan") check_device("nvptx -mcpu=sm_20") check_device("rocm") check_device("metal") check_device("opencl") check_device("cuda")
def test_gemm(): # graph nn = 2048 n = tvm.var('n') n = tvm.convert(nn) m, l = n, n A = tvm.placeholder((l, n), name='A') B = tvm.placeholder((l, m), name='B') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute( (m, n), lambda ii, jj: tvm.sum(A[k, jj] * B[k, ii], axis=k), name='C') # schedule s = tvm.create_schedule(C.op) AA = s.cache_read(A, "shared", [C]) BB = s.cache_read(B, "shared", [C]) AL = s.cache_read(AA, "local", [C]) BL = s.cache_read(BB, "local", [C]) CC = s.cache_write(C, "local") scale = 8 num_thread = 8 block_factor = scale * num_thread block_x = tvm.thread_axis("blockIdx.x") thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") block_y = tvm.thread_axis("blockIdx.y") thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y") thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx") thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy") by, yi = s[C].split(C.op.axis[0], factor=block_factor) bx, xi = s[C].split(C.op.axis[1], factor=block_factor) s[C].bind(by, block_y) s[C].bind(bx, block_x) s[C].reorder(by, bx, yi, xi) tyz, yi = s[C].split(yi, nparts=2) ty, yi = s[C].split(yi, nparts=num_thread) txz, xi = s[C].split(xi, nparts=2) tx, xi = s[C].split(xi, nparts=num_thread) s[C].bind(tyz, thread_yz) s[C].bind(txz, thread_xz) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) s[C].reorder(tyz, txz, ty, tx, yi, xi) s[CC].compute_at(s[C], tx) yo, xo = CC.op.axis ko, ki = s[CC].split(k, factor=8) kt, ki = s[CC].split(ki, factor=1) s[CC].reorder(ko, kt, ki, yo, xo) s[AA].compute_at(s[CC], ko) s[BB].compute_at(s[CC], ko) s[AL].compute_at(s[CC], kt) s[BL].compute_at(s[CC], kt) # Schedule for A's shared memory load ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread) _, xi = s[AA].split(s[AA].op.axis[1], factor=num_thread * 4) tx, xi = s[AA].split(xi, nparts=num_thread) s[AA].bind(ty, thread_y) s[AA].bind(tx, thread_x) s[AA].vectorize(xi) # Schedule for B' shared memory load ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread) _, xi = s[BB].split(s[BB].op.axis[1], factor=num_thread * 4) tx, xi = s[BB].split(xi, nparts=num_thread) s[BB].bind(ty, thread_y) s[BB].bind(tx, thread_x) s[BB].vectorize(xi) s[AA].double_buffer() s[BB].double_buffer() # correctness def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return f = tvm.build(s, [A, B, C], device) ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0) # launch the kernel. n, m, l = nn, nn, nn a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(m, l)).astype(B.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) for i in range(2): f(a, b, c) np.testing.assert_allclose( c.asnumpy(), np.dot(b_np.T, a_np), rtol=1e-5) with tvm.build_config(auto_unroll_max_step=32, auto_unroll_min_depth=0, unroll_explicit=False): check_device("cuda")
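# Back-of-the-envelope view of the decomposition above (nn=2048, scale=8, num_thread=8):
nn = 2048
scale, num_thread, vthread = 8, 8, 2
block_factor = scale * num_thread                     # 64 rows/cols of C per block
grid = (nn // block_factor, nn // block_factor)       # (32, 32) thread blocks
per_thread = block_factor // (vthread * num_thread)   # 4: each thread owns a 4x4 C sub-tile
print(grid, per_thread)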
import tvm n = 1024 m = 1024 A = tvm.placeholder((n, m), name='A') l = tvm.reduce_axis((0, m), name='l') B = tvm.compute((n, ), lambda i: tvm.sum(A[i, l], axis=l), name='B') s = tvm.create_schedule(B.op) print(tvm.lower(s, [A, B], simple_mode=True)) print("---------cutting line---------") s[B].parallel(B.op.reduce_axis[0]) print(tvm.lower(s, [A, B], simple_mode=True))
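# For comparison, a minimal variant that parallelizes the data-parallel output
# axis instead of the reduce axis; each worker then owns complete row sums.
import tvm

n = 1024
m = 1024
A = tvm.placeholder((n, m), name='A')
l = tvm.reduce_axis((0, m), name='l')
B = tvm.compute((n,), lambda i: tvm.sum(A[i, l], axis=l), name='B')

s = tvm.create_schedule(B.op)
s[B].parallel(B.op.axis[0])
print(tvm.lower(s, [A, B], simple_mode=True))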
def _compute_expsum(max_elem, *indices): eval_range = insert_reduce_index(indices, k2) return tvm.sum(tvm.exp(x[eval_range] - max_elem[indices]), axis=k2)
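# The helper above is one stage of a softmax; a self-contained 2-D sketch of the
# whole pipeline (max, exp-sum, normalize) in plain TVM, assuming a fixed shape:
import tvm

n, c = 4, 1000
x = tvm.placeholder((n, c), name='x')

k1 = tvm.reduce_axis((0, c), name='k1')
max_elem = tvm.compute((n,), lambda i: tvm.max(x[i, k1], axis=k1), name='max_elem')

k2 = tvm.reduce_axis((0, c), name='k2')
expsum = tvm.compute((n,),
                     lambda i: tvm.sum(tvm.exp(x[i, k2] - max_elem[i]), axis=k2),
                     name='expsum')

softmax = tvm.compute((n, c),
                      lambda i, j: tvm.exp(x[i, j] - max_elem[i]) / expsum[i],
                      name='softmax')

s = tvm.create_schedule(softmax.op)
print(tvm.lower(s, [x, softmax], simple_mode=True))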
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size=2): """Declare a winograd convolution - only tile_size=2 is currently supported""" N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation if int(kernel.shape[2]) == 3: if dilation_h != 1 or dilation_w != 1: kernel = dilate(kernel, (1, 1, dilation_h, dilation_w)) pre_computed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation" pre_computed = True H_CAT, W_CAT, CO, CI = get_const_tuple(kernel.shape) KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1 HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) assert layout == 'NCHW' assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 data_pad = pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") r = KW m = tile_size alpha = m + r - 1 A, B, G = winograd_transform_matrices(m, r, out_dtype) K = CO C = CI H = (IH + pt + pb - 3) // HSTR + 1 W = (IW + pl + pr - 3) // WSTR + 1 nH, nW = (H + m-1) // m, (W + m-1) // m P = N * nH * nW def upround(x, align): return (x + align - 1) // align * align ALIGN = 16 P_round = upround(P, ALIGN) K_round = upround(K, ALIGN) # CONFIG cfg.define_knob("data_transform_wgx", [1, 2, 4, 8, 16, 32, 64]) cfg.define_knob("data_transform_wgy", [1, 2, 4, 8, 16, 32, 64]) # Pack input tile input_tile = tvm.compute((N, C, H + 2, W + 2), lambda n, c, h, w: data_pad[n][c][h][w], name='d') if pre_computed: U = kernel else: U = _decl_winograd_kernel_transform(kernel, tile_size, G) # V [alpha * alpha, C, P_round) # Perform the image transform r_eps = tvm.reduce_axis((0, alpha), 'r_eps') r_nu = tvm.reduce_axis((0, alpha), 'r_nu') V = tvm.compute((alpha * alpha, C, P_round), lambda epsnu, c, b: tvm.sum(input_tile[b // (nH*nW)][c][b // nW % nH * m + r_eps][b % nW * m +r_nu]\ * B[r_eps][epsnu // alpha] * B[r_nu][epsnu % alpha], axis=[r_eps, r_nu]), name='V') # Winograd GEMM is a wrapper around batched GEMM to convert U to a 3D Tensor _, M = decl_winograd_gemm(cfg, U, V) # Y [K, P, m, m] # Winograd output transform r_eps = tvm.reduce_axis((0, alpha), 'r_eps') r_nu = tvm.reduce_axis((0, alpha), 'r_nu') Y = tvm.compute((K, P, m, m), lambda k, b, vh, vw: tvm.sum(M[r_eps * alpha + r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw], axis=[r_eps, r_nu]), name='Y') # Output [N, K, H, W] # Unpack back to NCHW format # The last term ensures alignment is not lost to bound inference output = tvm.compute((N, K, H, W), lambda n, k, h, w: Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m] + tvm.const(0, out_dtype) * M[(alpha*alpha)-1][K_round-1][P_round-1], name='output', tag='winograd_conv2d_output') return output
def conv_auto_tuned(ofmblock,ofw, ifmblock, stride_width,input_width,\ in_channel,input_height, filter_height, filter_width,ofh, stride_height, batch, out_channel): A1 = tvm.placeholder((batch, math.ceil( in_channel / ifmblock), input_height, input_width, ifmblock), name='input') W1 = tvm.placeholder( (math.ceil(out_channel / ofmblock), math.ceil(in_channel / ifmblock), filter_height, filter_width, ifmblock, ofmblock), name='weight') rco1 = tvm.reduce_axis((0, math.ceil(in_channel / ifmblock)), name='rco1') ry1 = tvm.reduce_axis((0, filter_height), name='ry1') rx1 = tvm.reduce_axis((0, filter_width), name='rx1') rci1 = tvm.reduce_axis((0, ifmblock), name='rci1') cfg = autotvm.get_config() cfg.define_knob("pack", [0, 1]) pack = False w_tile = [] factor_found = False for i in range(6, min(ofw + 1, 29)): if ofw % i == 0: w_tile.append((i, ofw // i)) factor_found = True if factor_found == False: w_tile.append((ofw, 1)) #tile factors for output width cfg.define_knob("tile_w", w_tile) # pack data when stride > 1 and pack flag set so that data for brgemm is continuous if filter_height == 1 and filter_width == 1 and stride_width > 1 and stride_height > 1 and cfg[ 'pack'].val == 1: A2 = tvm.compute( (batch, math.ceil(in_channel / ifmblock), ofh, ofw, ifmblock), lambda n, c, h, w, vlen1: A1[n, c, h * stride_height, w * stride_width, vlen1]) B1 = tvm.compute( (batch, math.ceil(out_channel / ofmblock), ofh, ofw, ofmblock), lambda nn, ff, yy, xx, vlen1: tvm.sum(W1[ ff, rco1, ry1, rx1, rci1, vlen1] * A2[nn, rco1, ry1 + yy, rx1 + xx, rci1], axis=[rco1, ry1, rx1, rci1]), name='output') pack = True else: # Compute the convolution B1 = tvm.compute( (batch, math.ceil(out_channel / ofmblock), ofh, ofw, ofmblock), lambda nn, ff, yy, xx, vlen1: tvm.sum( W1[ff, rco1, ry1, rx1, rci1, vlen1 ] * A1[nn, rco1, ry1 + stride_height * yy, rx1 + stride_width * xx, rci1], axis=[rco1, ry1, rx1, rci1]), name='output') s = tvm.create_schedule(B1.op) n, ko, h, w, ki = s[B1].op.axis rco, ry, rx, rci = s[B1].op.reduce_axis cfg.define_split("tile_h", h, num_outputs=3) #output height cfg.define_split("tile_c", rco, num_outputs=2) #input channel dimension cfg.define_split("tile_k", ko, num_outputs=2) #output channel dimension w_factor_inner, _ = cfg["tile_w"].val wo, wi = s[B1].split(w, w_factor_inner) #tiling rco_o, rco_i = cfg["tile_c"].apply(s, B1, rco) ko_o, ko_i = cfg["tile_k"].apply(s, B1, ko) ho, hm, hi = cfg["tile_h"].apply(s, B1, h) s[B1].reorder(n, ko_o, ho, ko_i, rco_o, hm, wo, hi, rco_i, ry, rx, wi, ki, rci) cfg.define_reorder("reorder_outer", [ko_i, rco_o, hm, wo], policy="all") cfg.add_flop( np.prod(get_const_tuple(B1.shape)) * in_channel * filter_height * filter_width * 2) cfg["reorder_outer"].apply(s, B1, [ko_i, rco_o, hm, wo]) if (filter_height == 1 and filter_width == 1 and stride_width == 1 and stride_height == 1) or pack: if cfg["tile_h"].size[ 1] > 1 and w_factor_inner == ofw: #cfg["tile_w"].size[2] == ofw: libxsmm_tensorize = intrin_libxsmm_hxw(ofmblock,w_factor_inner,ifmblock, 1, w_factor_inner, cfg["tile_c"].size[1],cfg["tile_h"].size[2],\ filter_height, filter_width,ofh,ofw,cfg["tile_h"].size[2],1, out_channel, ofh,ofw, in_channel) s[B1].tensorize(hi, libxsmm_tensorize) else: libxsmm_tensorize = intrin_libxsmm_tuned(ofmblock,w_factor_inner,ifmblock, 1, w_factor_inner, cfg["tile_c"].size[1], cfg["tile_h"].size[2],\ filter_height, filter_width,ofh, ofw, in_channel) s[B1].tensorize(rco_i, libxsmm_tensorize) else: libxsmm_tensorize = intrin_libxsmm_tuned(ofmblock,w_factor_inner,ifmblock, stride_width, 
                                                 w_factor_inner,
                                                 cfg["tile_c"].size[1], cfg["tile_h"].size[2],
                                                 filter_height, filter_width, input_height, input_width, in_channel)
        s[B1].tensorize(rco_i, libxsmm_tensorize)

    par = s[B1].fuse(n, ko_o, ho)
    s[B1].parallel(par)

    if pack:
        n1, c1, h1, w1, v1 = s[A2].op.axis
        par2 = s[A2].fuse(n1, c1, h1)
        # Parallelize and vectorize the packing stage over its own fused axis.
        s[A2].parallel(par2)
        s[A2].vectorize(v1)

    s = s.normalize()
    return s, [W1, A1, B1]
# === Start computation N = tvm.var('N') # Data set size D = tvm.var('D') # Feature number L = tvm.var('L') # Label number label = tvm.placeholder((N, L), name='label') data = tvm.placeholder((N, D), name='data') weight = tvm.placeholder((L, D + 1), name='weight') data_expand = tvm.compute((N, D + 1), lambda n, d: tvm.select((d < D), data[n, d], tvm.const(1, dtype=data.dtype)), name='data_expand') rd = tvm.reduce_axis((0, D + 1), name='rd') dot = tvm.compute((N, L), lambda n, l: tvm.sum(weight[l, rd] * data_expand[n, rd], axis=rd), name='dot') factor = tvm.compute((N, L), lambda n, l: 1 / (1 + tvm.exp(-dot[n, l])), name='factor') def argmax_combine(x, y): lhs = tvm.select((x[1] > y[1]), x[0], y[0]) rhs = tvm.select((x[1] > y[1]), x[1], y[1]) return lhs, rhs def argmax_identity(t0, t1): return tvm.const(-1, t0), tvm.min_value(t1) argmax = tvm.comm_reducer(argmax_combine, argmax_identity, name='argmax') dummy_idx = tvm.compute((L, ), lambda l: l, name='dummy_idx')
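# A minimal sketch of how the argmax reducer above might be applied: reduce over
# the label axis and keep, per sample, the index of the largest probability.
# (rl, pred_idx and pred_val are illustrative names, not part of the original.)
rl = tvm.reduce_axis((0, L), name='rl')
pred_idx, pred_val = tvm.compute((N,),
                                 lambda n: argmax((dummy_idx[rl], factor[n, rl]), axis=rl),
                                 name='pred')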
def depthwise_conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype=None): """Depthwise convolution nhwc forward operator. Parameters ---------- Input : tvm.Tensor 4-D with shape [batch, in_height, in_width, in_channel] Filter : tvm.Tensor 4-D with shape [filter_height, filter_width, in_channel, channel_multiplier] stride : tuple of two ints The spatial stride along height and width padding : int or str Padding size, or ['VALID', 'SAME'] dilation: int or a list/tuple of two ints dilation size, or [dilation_height, dilation_width] out_dtype: str, optional Output data type Returns ------- Output : tvm.Tensor 4-D with shape [batch, out_height, out_width, out_channel] """ out_dtype = Input.dtype if out_dtype is None else out_dtype if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation if dilation_h != 1 or dilation_w != 1: Filter = dilate(Filter, (dilation_h, dilation_w, 1, 1)) batch, in_height, in_width, in_channel = Input.shape # shape of dilated kernel filter_height, filter_width, filter_channel, channel_multiplier = Filter.shape pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (filter_height, filter_width)) out_channel = simplify(in_channel * channel_multiplier) out_height = simplify((in_height - filter_height + pad_top + pad_down) // stride_h + 1) out_width = simplify((in_width - filter_width + pad_left + pad_right) // stride_w + 1) # padding stage pad_before = [0, pad_top, pad_left, 0] pad_after = [0, pad_down, pad_right, 0] PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput") # depthconv stage di = tvm.reduce_axis((0, filter_height), name='di') dj = tvm.reduce_axis((0, filter_width), name='dj') Output = tvm.compute( (batch, out_height, out_width, out_channel), lambda b, i, j, c: tvm.sum( (PaddedInput[b, i*stride_h + di, j*stride_w + dj, c/channel_multiplier].astype( out_dtype) * Filter[di, dj, c/channel_multiplier, c%channel_multiplier].astype(out_dtype)), axis=[di, dj]), name='DepthwiseConv2d', tag="depthwise_conv2d_nhwc") return Output
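# How the flattened output channel index in the compute above maps back to
# (input channel, filter copy), shown for channel_multiplier = 2:
channel_multiplier = 2
for c in range(6):
    print(c, '->', (c // channel_multiplier, c % channel_multiplier))
# 0 -> (0, 0), 1 -> (0, 1), 2 -> (1, 0), 3 -> (1, 1), 4 -> (2, 0), 5 -> (2, 1)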