def _declaration_dense_nopack(cfg, data, weight, bias=None, out_dtype=None):
    if out_dtype is None:
        out_dtype = data.dtype
    batch, in_dim = get_const_tuple(data.shape)
    out_dim, _ = get_const_tuple(weight.shape)
    # create tuning space
    cfg.define_split("tile_x", out_dim, num_outputs=2)
    cfg.define_split("tile_y", batch, num_outputs=2)
    cfg.define_split("tile_k", in_dim, num_outputs=2)
    if cfg.is_fallback:
        _default_dense_nopack_config(cfg, batch, out_dim, in_dim)

    vec = cfg["tile_k"].size[-1]
    k = tvm.reduce_axis((0, in_dim // vec), "k")
    CC = tvm.compute((batch, out_dim, vec),
                     lambda z, y, x: tvm.sum(
                         data[z, k * vec + x].astype(out_dtype) *
                         weight[y, k * vec + x].astype(out_dtype), axis=k))

    kk = tvm.reduce_axis((0, vec), "kk")
    C = tvm.compute((batch, out_dim),
                    lambda y, x: tvm.sum(CC[y, x, kk], axis=kk),
                    tag="dense_nopack")
    if bias is not None:
        C = tvm.compute((batch, out_dim),
                        lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)
    return C

def packed_conv2d(data, kernel, padding, strides, out_dtype="int32"):
    """ Packed conv2d function. """
    if padding[0]:
        pad_data = topi.nn.pad(data, [0, 0, padding[0], padding[1], 0, 0], name="pad_data")
    else:
        pad_data = data
    assert len(data.shape) == 6
    assert len(kernel.shape) == 6
    oheight = topi.util.simplify((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1)
    owidth = topi.util.simplify((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1)
    oshape = (data.shape[0], kernel.shape[0], oheight, owidth, data.shape[4], kernel.shape[4])

    ishape = topi.util.get_const_tuple(data.shape)
    kshape = topi.util.get_const_tuple(kernel.shape)
    assert data.dtype == "int8", data.dtype
    assert kernel.dtype == "int8", kernel.dtype

    d_i = tvm.reduce_axis((0, kshape[2]), name='d_i')
    d_j = tvm.reduce_axis((0, kshape[3]), name='d_j')
    k_o = tvm.reduce_axis((0, ishape[1]), name='k_o')
    k_i = tvm.reduce_axis((0, ishape[-1]), name='k_i')
    hstride, wstride = strides
    res = tvm.compute(
        oshape,
        lambda b_o, c_o, i, j, b_i, c_i: tvm.sum(
            pad_data[b_o, k_o, i * hstride + d_i, j * wstride + d_j, b_i, k_i].astype(out_dtype) *
            kernel[c_o, k_o, d_i, d_j, c_i, k_i].astype(out_dtype),
            axis=[k_o, d_i, d_j, k_i]),
        name="res", tag="packed_conv2d")
    return res

def test_conv_tiling():
    HSTR = WSTR = 1
    in_channel = 128
    kernel_height = kernel_width = 3
    out_channel = 64
    batch_size = 1
    in_height = in_width = 64
    out_height = out_width = in_height - kernel_height + 1
    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel, out_channel), name='kernel')
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')
    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
                       lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] *
                                                     kernel[kh, kw, ic, oc],
                                                     axis=[ic, kh, kw]),
                       name="conv2d")
    s = tvm.create_schedule(conv.op)

    n, oc, oh, ow = conv.op.axis
    oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)

    stmt = tvm.ir_pass.LoopPartition(stmt, True)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))

def _sample(i, c, ph, pw):
    roi = rois[i]
    batch_index = roi[0].astype('int32')
    roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4]
    roi_start_h *= spatial_scale
    roi_end_h *= spatial_scale
    roi_start_w *= spatial_scale
    roi_end_w *= spatial_scale

    # force malformed ROIs to be 1x1
    roi_h = tvm.max(roi_end_h - roi_start_h, tvm.const(1.0, dtype))
    roi_w = tvm.max(roi_end_w - roi_start_w, tvm.const(1.0, dtype))

    bin_h = roi_h / pooled_size_h
    bin_w = roi_w / pooled_size_w

    if sample_ratio > 0:
        roi_bin_grid_h = roi_bin_grid_w = tvm.const(sample_ratio, 'int32')
    else:
        roi_bin_grid_h = tvm.ceil(roi_h / pooled_size_h).astype('int32')
        roi_bin_grid_w = tvm.ceil(roi_w / pooled_size_w).astype('int32')

    count = roi_bin_grid_h * roi_bin_grid_w
    rh = tvm.reduce_axis((0, roi_bin_grid_h))
    rw = tvm.reduce_axis((0, roi_bin_grid_w))
    roi_start_h += ph * bin_h
    roi_start_w += pw * bin_w
    return tvm.sum(_bilinear(batch_index, c,
                             roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
                             roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w) / count,
                   axis=[rh, rw])

def test_lstm_cell_inline():
    num_step = 128
    num_input = 256
    num_hidden = 1152
    batch_size = 4
    # Global transition matrix
    X = tvm.placeholder((num_step - 1, batch_size, num_input), name="X")
    Wi2h = tvm.placeholder((4, num_hidden, num_input), name="Wi2h")
    Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h")
    # h: output hidden state, c: cell state.
    s_state_h = tvm.placeholder((num_step, batch_size, num_hidden))
    s_state_c = tvm.placeholder((num_step, batch_size, num_hidden))
    s_init_c = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_c")
    s_init_h = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_h")
    # LSTM transition
    k = tvm.reduce_axis((0, num_input), name="ki2h")
    s_i2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k),
        name="s_i2h")
    k = tvm.reduce_axis((0, num_hidden), name="ki2h")
    s_h2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k),
        name="s_h2h")
    # Gate rules
    gates = tvm.compute(s_i2h.shape, lambda *i:
                        s_i2h(*i) + s_h2h(*i), name="gates")
    gshape = (num_step, batch_size, num_hidden)
    in_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 0, i, j]),
                          name="in_gate")
    in_transform = tvm.compute(gshape, lambda t, i, j: tvm.tanh(gates[t, 1, i, j]),
                               name="in_transform")
    forget_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 2, i, j]),
                              name="forget_gate")
    out_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 3, i, j]),
                           name="out_gate")
    next_c = tvm.compute(gshape, lambda t, i, j:
                         forget_gate[t, i, j] * s_state_c[t - 1, i, j] +
                         in_gate[t, i, j] * in_transform[t, i, j], name="next_c")
    next_h = tvm.compute(gshape, lambda t, i, j:
                         out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]), name="next_h")
    update_c = tvm.compute(gshape, lambda *i: next_c(*i), name="update_c")
    update_h = tvm.compute(gshape, lambda *i: next_h(*i), name="update_h")
    # build the scan
    scan_h, scan_c = tvm.scan(
        [s_init_h, s_init_c],
        [update_h, update_c],
        [s_state_h, s_state_c],
        inputs=[X],
        name="lstm_scan")
    # schedule
    s = tvm.create_schedule(scan_h.op)
    # Inline gate computations
    s[gates].compute_inline()
    s[in_gate].compute_inline()
    s[in_transform].compute_inline()
    s[forget_gate].compute_inline()
    s[out_gate].compute_inline()
    # verify we can lower correctly
    tvm.lower(s, [X, Wi2h, Wh2h, scan_h, scan_c])

def test_rfactor():
    n = tvm.var('n')
    k1 = tvm.reduce_axis((0, n), name="k1")
    k2 = tvm.reduce_axis((0, n), name="k2")
    A = tvm.placeholder((n, n, n), name='A')
    B = tvm.compute((n, ), lambda i: tvm.sum(A[i, k1, k2], axis=[k1, k2]))
    # normal schedule
    s = tvm.create_schedule(B.op)
    BF = s.rfactor(B, k1)
    assert(tuple(BF.shape) == (n, n))
    assert(set(BF.op.body[0].axis) == set([k2]))
    assert(s[B].op.body[0].axis[0].dom.extent == n)
    assert(len(s[B].all_iter_vars) == 2)
    # schedule with split
    s = tvm.create_schedule(B.op)
    ko, ki = s[B].split(k1, factor=4)
    xo, xi = s[B].split(B.op.axis[0], factor=8)
    BF = s.rfactor(B, ki)
    assert(BF.shape[0].value == 4)
    assert(BF.shape[1] == n)
    assert(BF.op.body[0].axis[0] == k2)
    assert(BF.op.body[0].axis[1].var == ko.var)
    assert(s[B].op.body[0].axis[0].dom.extent.value == 4)
    # schedule with factor_axis
    s = tvm.create_schedule(B.op)
    ko, ki = s[B].split(k1, factor=4)
    xo, xi = s[B].split(B.op.axis[0], factor=8)
    BF = s.rfactor(B, ki, 1)
    assert(n == BF.shape[0])
    assert(BF.shape[1].value == 4)
    assert(BF.op.body[0].axis[0] == k2)
    assert(BF.op.body[0].axis[1].var == ko.var)
    assert(s[B].op.body[0].axis[0].dom.extent.value == 4)

def test_in_bounds_conv_llvm(loop_tiling=False):
    HSTR = WSTR = 1
    in_channel = 128
    kernel_height = kernel_width = 3
    out_channel = 64
    batch_size = 1
    in_height = in_width = 64
    out_height = out_width = in_height - kernel_height + 1
    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel, out_channel), name='kernel')
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')
    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
                       lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] *
                                                     kernel[kh, kw, ic, oc],
                                                     axis=[ic, kh, kw]),
                       name="conv2d")
    s = tvm.create_schedule(conv.op)

    n, oc, oh, ow = conv.op.axis
    if loop_tiling:
        oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
    lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True)
    print(lowered_func.body)
    ctx = tvm.cpu(0)

    f = tvm.build(s, [data, kernel, conv], "llvm")
    data_input = tvm.nd.array(np.random.uniform(
        size=(batch_size, in_channel, in_height, in_width)).astype(tvm.float32), ctx)
    kernel_input = tvm.nd.array(np.random.uniform(
        size=(kernel_height, kernel_width, in_channel, out_channel)).astype(tvm.float32), ctx)
    conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), tvm.float32, ctx)
    f(data_input, kernel_input, conv_out)

def global_pool(data, pool_type):
    """Perform global pooling on the data

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, channel, in_height, in_width]

    pool_type : str
        Pool type, 'max' or 'avg'

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, channel, 1, 1]
    """
    assert len(data.shape) == 4, "only support 4-dim pooling"
    batch, channel, height, width = data.shape

    dheight = tvm.reduce_axis((0, height))
    dwidth = tvm.reduce_axis((0, width))

    if pool_type == 'max':
        return tvm.compute((batch, channel, 1, 1), lambda n, c, h, w:
                           tvm.max(data[n, c, dheight, dwidth], axis=[dheight, dwidth]),
                           tag="global_pool_max")
    elif pool_type == 'avg':
        tsum = tvm.compute((batch, channel, 1, 1), lambda n, c, h, w:
                           tvm.sum(data[n, c, dheight, dwidth], axis=[dheight, dwidth]),
                           tag="global_pool_sum")
        return tvm.compute((batch, channel, 1, 1), lambda n, c, h, w:
                           tsum[n, c, h, w] / (height*width).astype(tsum.dtype),
                           tag=tag.ELEMWISE)
    else:
        raise ValueError("Pool type should be 'avg' or 'max'.")

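
# --- usage sketch -------------------------------------------------------
# A minimal check for global_pool, hedged: it assumes the tvm/numpy imports
# used throughout this module and an LLVM-enabled build; shapes are
# illustrative only. The 'avg' variant is compared against numpy.mean.
def _check_global_avg_pool():
    import numpy as np
    data = tvm.placeholder((1, 3, 8, 8), name='data')
    out = global_pool(data, 'avg')
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, out], "llvm")
    ctx = tvm.cpu(0)
    a_np = np.random.uniform(size=(1, 3, 8, 8)).astype(data.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.empty((1, 3, 1, 1), out.dtype, ctx)
    f(a, b)
    tvm.testing.assert_allclose(
        b.asnumpy(), a_np.mean(axis=(2, 3), keepdims=True), rtol=1e-5)
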
def _spatial_pack(data, kernel, stride, padding, out_dtype):
    """ Compute convolution with pack on spatial axes. """
    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)

    H, W = wkl.height, wkl.width
    CI, CO = wkl.in_filter, wkl.out_filter
    KH, KW = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride
    HCAT, WCAT = KH-1, KW-1

    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    UNROLL = sch.unroll

    TH = H + 2*HPAD
    TW = W + 2*WPAD
    OH = (H + 2*HPAD - KH) // HSTR + 1
    OW = (W + 2*WPAD - KW) // WSTR + 1

    dshape = (1, CI, H, W)
    dpshape = (1, CI, TH, TW)
    dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT)

    kshape = (CO, CI, KH, KW)
    kvshape = (CO//VC, CI, KH, KW, VC)

    ovshape = (1, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (1, CO, OH, OW)

    DOPAD = (HPAD != 0 or WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw:
                           data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec')

    kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc:
                             kernel[co*VC+vc][ci][dh][dw], name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')

    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc:
                       tvm.sum(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw].astype(out_dtype) *
                               kernel_vec[co, ci, dh, dw, vc].astype(out_dtype),
                               axis=[ci, dh, dw]), name='conv')

    output = tvm.compute(oshape, lambda n, co, h, w:
                         conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
                         name='output_unpack', tag='spatial_conv_output')

    return output

def test_tensor_reduce_multi_axis():
    m = tvm.var('m')
    n = tvm.var('n')
    A = tvm.placeholder((m, n), name='A')
    k1 = tvm.reduce_axis((0, n), "k")
    k2 = tvm.reduce_axis((0, m), "k")
    C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=(k1, k2)))
    C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=[k1, k2]))

def test_verify_compute():
    n = tvm.var("n")
    m = tvm.var("m")
    A = tvm.placeholder((n, m), name='A')
    k = tvm.reduce_axis((0, m), "k")
    k_ = tvm.reduce_axis((0, m-1), "k_")
    f1 = lambda i: tvm.sum(A[i, k], axis=k)
    f2 = lambda i: A[i, 0] + 1
    f3 = lambda i: tvm.sum(A[i, k], axis=k) + 1
    f4 = lambda i: A[i, 0] * (tvm.sum(A[i, k], axis=k) + 1)
    f5 = lambda i: (tvm.sum(A[i, k], axis=k), A[i, 0] + 1)
    f6 = lambda i: (tvm.sum(A[i, k], axis=k), tvm.sum(A[i, k_], axis=k_))

    # Valid compute
    try:
        B = tvm.compute((n,), f1, name="B")
    except tvm._ffi.base.TVMError as ex:
        assert False

    # Valid compute
    try:
        B = tvm.compute((n,), f2, name="B")
    except tvm._ffi.base.TVMError as ex:
        assert False

    # Invalid compute with non top level reduction
    try:
        B = tvm.compute((n,), f3, name="B")
        assert False
    except tvm._ffi.base.TVMError as ex:
        pass

    # Invalid compute with non top level reduction
    try:
        B = tvm.compute((n,), f4, name="B")
        assert False
    except tvm._ffi.base.TVMError as ex:
        pass

    # Invalid compute with reduction and non-reduction batch ops
    try:
        B0, B1 = tvm.compute((n,), f5, name="B")
        assert False
    except tvm._ffi.base.TVMError as ex:
        pass

    # Invalid compute with unequal batch reduction ops
    try:
        B0, B1 = tvm.compute((n,), f6, name="B")
        assert False
    except tvm._ffi.base.TVMError as ex:
        pass

def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype):
    """Transposed 2D convolution nchw forward operator.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    Filter : tvm.Tensor
        4-D with shape [in_channel, num_filter, filter_height, filter_width]

    strides : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    out_dtype : str
        The output data type. This is used for mixed precision.

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    batch, in_c, in_h, in_w = Input.shape
    _, out_c, filter_h, filter_w = Filter.shape
    stride_h, stride_w = strides
    # dilate stage
    DilatedInput = dilate(Input, [1, 1, stride_h, stride_w], name='DilatedInput')
    # padding stage
    fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(padding, (filter_h, filter_w))
    bpad_top = filter_h - 1 - fpad_top
    bpad_bottom = filter_h - 1 - fpad_bottom
    bpad_left = filter_w - 1 - fpad_left
    bpad_right = filter_w - 1 - fpad_right
    PaddedInput = pad(DilatedInput,
                      [0, 0, bpad_top, bpad_left],
                      [0, 0, bpad_bottom, bpad_right],
                      name='PaddedInput')
    # convolution stage
    out_c = simplify(out_c)
    out_h = simplify((in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h)
    out_w = simplify((in_w - 1) * stride_w - fpad_left - fpad_right + filter_w)
    dc = tvm.reduce_axis((0, in_c), name='dc')
    dh = tvm.reduce_axis((0, filter_h), name='dh')
    dw = tvm.reduce_axis((0, filter_w), name='dw')

    Output = tvm.compute(
        (batch, out_c, out_h, out_w),
        lambda b, c, h, w: tvm.sum(
            PaddedInput[b, dc, h+dh, w+dw].astype(out_dtype) *
            Filter[dc, c, filter_h-1-dh, filter_w-1-dw].astype(out_dtype),
            axis=[dc, dh, dw]), tag="conv2d_transpose_nchw")

    return Output

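
# --- usage sketch -------------------------------------------------------
# Hedged example: assuming the dilate/pad/get_pad_tuple/simplify helpers this
# module imports, this just lowers the declaration above to confirm it is
# schedulable; sizes are arbitrary. Note the filter layout is
# [in_channel, out_channel, kh, kw].
def _lower_conv2d_transpose_example():
    data = tvm.placeholder((1, 4, 8, 8), name='data')
    weights = tvm.placeholder((4, 2, 3, 3), name='weights')
    out = conv2d_transpose_nchw(data, weights, strides=(2, 2), padding=(1, 1),
                                out_dtype='float32')
    s = tvm.create_schedule(out.op)
    print(tvm.lower(s, [data, weights, out], simple_mode=True))
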
def depthwise_conv2d_backward_input_nhwc(Filter, Out_grad, oshape, ishape, stride, padding):
    """Depthwise convolution nhwc backward wrt input operator.

    Parameters
    ----------
    Filter : tvm.Tensor
        4-D with shape [filter_height, filter_width, in_channel, channel_multiplier]

    Out_grad : tvm.Tensor
        4-D with shape [batch, out_height, out_width, out_channel]

    stride : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]
    """
    batch, in_h, in_w, in_c = ishape
    _, out_h, out_w, out_c = oshape
    filter_h, filter_w, _, channel_multiplier = Filter.shape
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    dilated_out_grad = dilate(Out_grad, [1, stride_h, stride_w, 1], name='dilated_out_grad')

    # padding params in forward propagation
    fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(padding, (filter_h, filter_w))
    # padding params in backward propagation
    bpad_top = filter_h - 1 - fpad_top
    bpad_bottom = (filter_h - 1 - fpad_bottom) + (stride_h - 1)
    bpad_left = filter_w - 1 - fpad_left
    bpad_right = (filter_w - 1 - fpad_right) + (stride_w - 1)

    padded_out_grad = pad(dilated_out_grad,
                          [0, bpad_top, bpad_left, 0],
                          [0, bpad_bottom, bpad_right, 0],
                          name='padded_out_grad')

    dh = tvm.reduce_axis((0, filter_h), name='dh')
    dw = tvm.reduce_axis((0, filter_w), name='dw')
    dc = tvm.reduce_axis((0, channel_multiplier), name='dc')

    In_grad = tvm.compute(
        (batch, in_h, in_w, in_c),
        lambda b, h, w, c: tvm.sum(
            padded_out_grad[b, h+dh, w+dw, c*channel_multiplier + dc] *
            Filter[filter_h-1-dh, filter_w-1-dw, c, dc],
            axis=[dh, dw, dc]),
        tag='depthwise_conv2d_backward_input_nhwc')

    return In_grad

def _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, out_dtype):
    """ Compute convolution with pack on spatial axes. """
    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NHWC")
    sch = _get_schedule(wkl, "NHWC")
    VH = sch.vh
    VW = sch.vw
    VC = sch.vc

    data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type='uint8')
    kernel_vec = _kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC)
    N, H, W, IB, CI = data_q.shape
    OCO, KH, KW, KB, VC, _ = kernel_vec.shape
    CO = OCO * VC

    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride
    HCAT, WCAT = KH-1, KW-1

    PAD_H = H + 2*HPAD
    PAD_W = W + 2*WPAD
    OH = (H + 2*HPAD - KH) // HSTR + 1
    OW = (W + 2*WPAD - KW) // WSTR + 1

    dvshape = (N, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, IB, CI)
    ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC)
    oshape = (1, OH, OW, CO)

    if (HPAD != 0 or WPAD != 0):
        data_pad = pad(data_q, (0, HPAD, WPAD, 0, 0), name="data_pad")
    else:
        data_pad = data_q

    data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, b, ci:
                           data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][b][ci], name='data_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')
    ib = tvm.reduce_axis((0, IB), name='ib')
    kb = tvm.reduce_axis((0, KB), name='kb')

    def _conv(n, h, w, co, vh, vw, vc):
        return tvm.sum((tvm.popcount(
            kernel_vec[co, dh, dw, kb, vc, ci].astype('uint16') &
            data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci].astype('uint16'))
                        << (kb + ib).astype('uint16')), axis=[dh, dw, kb, ib, ci])

    conv = tvm.compute(ovshape, _conv, name='conv')

    return tvm.compute(oshape, lambda n, h, w, co:
                       conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC].astype(out_dtype),
                       name='output_vec', tag='spatial_bitserial_conv_nhwc')

def test_reduce_simplify():
    ck = CanonicalChecker()
    k = tvm.reduce_axis((0, 10), name="k")
    j = tvm.reduce_axis((-5, 3), name="j")
    A = tvm.placeholder((10,), name='A')
    ck.verify(tvm.sum(tvm.expr.Select(k + j < 12, k + j, 0), [k, j]),
              tvm.sum(k + j, [k, j]))
    ck.verify(tvm.sum(A[3], []), A[3])
    # The rule below is not typical, removed for now
    ck.verify(tvm.sum(k / 10, k), tvm.sum(tvm.const(0, "int32"), k))

def depthwise_conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
    """Depthwise convolution nchw forward operator.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    Filter : tvm.Tensor
        4-D with shape [in_channel, channel_multiplier, filter_height, filter_width]

    stride : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    out_dtype : str, optional
        Output data type; defaults to the input data type

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    out_dtype = Input.dtype if out_dtype is None else out_dtype

    batch, in_channel, in_height, in_width = Input.shape
    filter_channel, channel_multiplier, filter_height, filter_width = Filter.shape
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (filter_height, filter_width))
    out_channel = simplify(in_channel * channel_multiplier)
    out_height = simplify((in_height - filter_height + pad_top + pad_down) // stride_h + 1)
    out_width = simplify((in_width - filter_width + pad_left + pad_right) // stride_w + 1)

    # padding stage
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
    # depthconv stage
    di = tvm.reduce_axis((0, filter_height), name='di')
    dj = tvm.reduce_axis((0, filter_width), name='dj')
    Output = tvm.compute(
        (batch, out_channel, out_height, out_width),
        lambda b, c, i, j: tvm.sum(
            (PaddedInput[b, c//channel_multiplier, i*stride_h+di,
                         j*stride_w+dj].astype(out_dtype) *
             Filter[c//channel_multiplier, c%channel_multiplier, di, dj].astype(out_dtype)),
            axis=[di, dj]),
        name='DepthwiseConv2d', tag="depthwise_conv2d_nchw")
    return Output

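
# --- usage sketch -------------------------------------------------------
# Quick shape sanity check, hedged: assumes the pad/get_pad_tuple/simplify
# helpers already imported by this module. With channel_multiplier=2, the
# 8 input channels fan out to 16 output channels.
def _depthwise_shape_example():
    data = tvm.placeholder((1, 8, 16, 16), name='data')
    kern = tvm.placeholder((8, 2, 3, 3), name='filter')
    out = depthwise_conv2d_nchw(data, kern, stride=1, padding=1)
    assert get_const_tuple(out.shape) == (1, 16, 16, 16)
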
def _depthwise_conv2d_NCHWc_cpu(cfg, data, kernel, strides, padding, dilation,
                                layout, out_layout, out_dtype=None):
    out_dtype = data.dtype if out_dtype is None else out_dtype
    batch, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(data.shape)
    out_channel_chunk, _, filter_height, filter_width, __, out_channel_block \
        = get_const_tuple(kernel.shape)

    strides = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    HSTR, WSTR = strides

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (filter_height, filter_width))

    dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
    assert (dh, dw) == (1, 1), "Does not support dilation"

    in_channel = in_channel_chunk * in_channel_block
    out_channel = out_channel_chunk * out_channel_block
    channel_multiplier = out_channel // in_channel

    out_height = (in_height - filter_height + pad_top + pad_down) // HSTR + 1
    out_width = (in_width - filter_width + pad_left + pad_right) // WSTR + 1

    # get workload and related schedule config
    wkl = _get_workload(tvm.placeholder((batch, in_channel, in_height, in_width),
                                        dtype=data.dtype),
                        tvm.placeholder((out_channel, in_channel, filter_height, filter_width),
                                        dtype=kernel.dtype),
                        strides, padding, out_dtype)
    if cfg.is_fallback:
        _fallback_schedule(cfg, wkl)

    # padding stage
    DOPAD = (pad_top != 0 or pad_left != 0 or pad_down != 0 or pad_right != 0)
    if DOPAD:
        pad_before = [0, 0, pad_top, pad_left, 0]
        pad_after = [0, 0, pad_down, pad_right, 0]
        data_pad = pad(data, pad_before, pad_after, name="PaddedInput")
    else:
        data_pad = data

    # depthconv stage
    kh = tvm.reduce_axis((0, filter_height), name='kh')
    kw = tvm.reduce_axis((0, filter_width), name='kw')
    Output = tvm.compute(
        (batch, out_channel_chunk, out_height, out_width, out_channel_block),
        lambda b, oco, oh, ow, oci: tvm.sum(
            (data_pad[b,
                      (oco * out_channel_block + oci) // channel_multiplier // in_channel_block,
                      oh*HSTR+kh, ow*WSTR+kw,
                      ((oco * out_channel_block + oci) // channel_multiplier) % in_channel_block]
             .astype(out_dtype) *
             kernel[oco, 0, kh, kw, 0, oci].astype(out_dtype)),
            axis=[kh, kw]),
        name='DepthwiseConv2d', tag="depthwise_conv2d_NCHWc")
    return Output

def compute_conv(data, weight):
    N, IC, H, W = data.shape
    OC, IC, KH, KW = weight.shape
    OH = H - KH + 1
    OW = W - KW + 1

    ic = tvm.reduce_axis((0, IC), name='ic')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')

    return tvm.compute((N, OC, OH, OW), lambda i, oc, h, w:
                       tvm.sum(data[i, ic, h+dh, w+dw] * weight[oc, ic, dh, dw],
                               axis=[ic, dh, dw]))

def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'):
    """Convolution operator in NHWC layout.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]

    Filter : tvm.Tensor
        4-D with shape [filter_height, filter_width, in_channel, num_filter]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, out_height, out_width, out_channel]
    """
    assert isinstance(stride, int) or len(stride) == 2
    batch, in_height, in_width, in_channel = Input.shape
    kernel_h, kernel_w, channel, num_filter = Filter.shape
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (kernel_h, kernel_w))
    # compute the output shape
    out_channel = num_filter
    out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_down, pad_right, 0]
    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
    rc = tvm.reduce_axis((0, in_channel), name='rc')
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')
    Output = tvm.compute(
        (batch, out_height, out_width, out_channel),
        lambda nn, yy, xx, ff: tvm.sum(
            PaddedInput[nn, yy * stride_h + ry, xx * stride_w + rx, rc].astype(out_dtype) *
            Filter[ry, rx, rc, ff].astype(out_dtype), axis=[ry, rx, rc]),
        name="Conv2dOutput", tag="conv2d_nhwc")
    return Output

def conv2d_nchw(Input, Filter, stride, padding, out_dtype='float32'):
    """Convolution operator in NCHW layout.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    Filter : tvm.Tensor
        4-D with shape [num_filter, in_channel, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    assert isinstance(stride, int) or len(stride) == 2
    batch, in_channel, in_height, in_width = Input.shape
    num_filter, channel, kernel_h, kernel_w = Filter.shape
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (kernel_h, kernel_w))
    # compute the output shape
    out_channel = num_filter
    out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
    # compute graph
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    temp = pad(Input, pad_before, pad_after, name="pad_temp")
    rc = tvm.reduce_axis((0, in_channel), name='rc')
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')

    return tvm.compute(
        (batch, out_channel, out_height, out_width),
        lambda nn, ff, yy, xx: tvm.sum(
            temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
            Filter[ff, rc, ry, rx].astype(out_dtype),
            axis=[rc, ry, rx]), tag="conv2d_nchw")

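
# --- usage sketch -------------------------------------------------------
# End-to-end check for conv2d_nchw, hedged: an LLVM-enabled build is assumed
# and sizes are kept tiny. A naive numpy loop serves as the reference.
def _check_conv2d_nchw():
    import numpy as np
    data = tvm.placeholder((1, 2, 6, 6), name='data')
    kern = tvm.placeholder((3, 2, 3, 3), name='filter')
    out = conv2d_nchw(data, kern, stride=1, padding=0)
    s = tvm.create_schedule(out.op)
    func = tvm.build(s, [data, kern, out], "llvm")
    ctx = tvm.cpu(0)
    a_np = np.random.uniform(size=(1, 2, 6, 6)).astype('float32')
    w_np = np.random.uniform(size=(3, 2, 3, 3)).astype('float32')
    b_np = np.zeros((1, 3, 4, 4), dtype='float32')
    for oc in range(3):          # reference: direct convolution
        for y in range(4):
            for x in range(4):
                b_np[0, oc, y, x] = np.sum(a_np[0, :, y:y+3, x:x+3] * w_np[oc])
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.empty((1, 3, 4, 4), 'float32', ctx)
    func(a, w, b)
    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-4)
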
def conv2d_winograd_weight_transform(kernel, tile_size):
    """Weight transformation for winograd

    Parameters
    ----------
    kernel: Tensor
        The raw kernel tensor with layout "NCHW".
        Only 3x3 kernel is supported for now.
    tile_size: int
        Tile size of winograd transform.
        e.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [alpha, alpha, CO, CI]
    """
    K = 3

    shape = get_const_tuple(kernel.shape)
    assert shape[2:] == (K, K), "Only support 3x3 kernel"

    r = tile_size + K - 1
    shape = (r, r) + shape[:2]

    if tile_size == 2:
        G_data = np.array([
            [1, 0, 0],
            [1.0/2, 1.0/2, 1.0/2],
            [1.0/2, -1.0/2, 1.0/2],
            [0, 0, 1],
        ], dtype=kernel.dtype)
    elif tile_size == 4:
        G_data = np.array([
            [1 / 4.0, 0, 0],
            [-1 / 6.0, -1 / 6.0, -1 / 6.0],
            [-1 / 6.0, 1 / 6.0, -1 / 6.0],
            [1 / 24.0, 1 / 12.0, 1 / 6.0],
            [1 / 24.0, -1 / 12.0, 1 / 6.0],
            [0, 0, 1]
        ], dtype=kernel.dtype)
    else:
        raise ValueError("Unsupported tile size: " + str(tile_size))

    G = const_matrix(G_data, 'G')
    r_kh = tvm.reduce_axis((0, K), name='r_kh')
    r_kw = tvm.reduce_axis((0, K), name='r_kw')
    return tvm.compute(shape, lambda eps, nu, co, ci:
                       tvm.sum(kernel[co][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
                               axis=[r_kh, r_kw]), name='transform_weight')

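
# --- usage sketch -------------------------------------------------------
# Shape check, hedged: assumes the const_matrix/get_const_tuple helpers this
# module imports. For F(2x2, 3x3), alpha = tile_size + 3 - 1 = 4, so a
# [CO, CI, 3, 3] kernel becomes a [4, 4, CO, CI] tensor.
def _winograd_weight_shape_example():
    kernel = tvm.placeholder((16, 8, 3, 3), name='kernel')
    transformed = conv2d_winograd_weight_transform(kernel, tile_size=2)
    assert get_const_tuple(transformed.shape) == (4, 4, 16, 8)
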
def depthwise_conv2d_backward_weight_nhwc(Input, Out_grad, oshape, fshape, stride, padding):
    """Depthwise convolution nhwc backward wrt weight operator.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]

    Out_grad : tvm.Tensor
        4-D with shape [batch, out_height, out_width, out_channel]

    stride : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [filter_height, filter_width, in_channel, channel_multiplier]
    """
    batch, out_h, out_w, out_c = oshape
    filter_h, filter_w, _, channel_multiplier = fshape
    in_c = Input.shape[3].value
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (filter_h, filter_w))

    padded_in = pad(Input,
                    [0, pad_top, pad_left, 0],
                    [0, pad_bottom, pad_right, 0],
                    name='padded_in')

    dh = tvm.reduce_axis((0, Out_grad.shape[1].value), name='dh')
    dw = tvm.reduce_axis((0, Out_grad.shape[2].value), name='dw')
    db = tvm.reduce_axis((0, batch), name='db')

    Weight_grad = tvm.compute(
        (filter_h, filter_w, in_c, channel_multiplier),
        lambda fh, fw, c, m: tvm.sum(
            Out_grad[db, dh, dw, c*channel_multiplier+m%channel_multiplier] *
            padded_in[db, fh+dh*stride_h, fw+dw*stride_w, c],
            axis=[db, dh, dw]),
        tag='depthwise_conv2d_backward_weight_nhwc')

    return Weight_grad

def _declaration_dense_pack(cfg, data, weight, bias=None, out_dtype=None):
    if out_dtype is None:
        out_dtype = data.dtype
    batch, in_dim = get_const_tuple(data.shape)
    out_dim, _ = get_const_tuple(weight.shape)
    # create tuning space
    cfg.define_split("tile_y", batch, num_outputs=3)
    cfg.define_split("tile_x", out_dim, num_outputs=3)
    cfg.define_split("tile_k", in_dim, num_outputs=2)
    if cfg.is_fallback:
        _default_dense_pack_config(cfg, batch, out_dim, in_dim)

    packw_bn = cfg["tile_x"].size[-1]
    packw_shape = (out_dim // packw_bn, in_dim, packw_bn)
    packw = tvm.compute(packw_shape,
                        lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight")

    k = tvm.reduce_axis((0, in_dim), name="k")
    C = tvm.compute((batch, out_dim),
                    lambda y, x: tvm.sum(
                        data[y, k].astype(out_dtype) *
                        packw[x // packw_bn, k, x % packw_bn].astype(out_dtype),
                        axis=k),
                    tag="dense_pack")
    if bias is not None:
        C = tvm.compute((batch, out_dim),
                        lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)
    return C

def dense_default(data, weight, bias=None):
    """The default implementation of dense in topi.

    Parameters
    ----------
    data : tvm.Tensor
        2-D with shape [batch, in_dim]

    weight : tvm.Tensor
        2-D with shape [out_dim, in_dim]

    bias : tvm.Tensor, optional
        1-D with shape [out_dim]

    Returns
    -------
    output : tvm.Tensor
        2-D with shape [batch, out_dim]
    """
    assert len(data.shape) == 2 and len(weight.shape) == 2, \
        "only support 2-dim dense"
    if bias is not None:
        assert len(bias.shape) == 1
    batch, in_dim = data.shape
    out_dim, _ = weight.shape
    k = tvm.reduce_axis((0, in_dim), name='k')
    matmul = tvm.compute((batch, out_dim),
                         lambda i, j: tvm.sum(data[i, k] * weight[j, k], axis=k),
                         tag='dense')
    if bias is not None:
        matmul = tvm.compute((batch, out_dim),
                             lambda i, j: matmul[i, j] + bias[j],
                             tag=tag.BROADCAST)
    return matmul

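
# --- usage sketch -------------------------------------------------------
# Hedged check for dense_default against numpy (LLVM build assumed). Since
# weight is stored [out_dim, in_dim], the numpy reference is data.dot(weight.T).
def _check_dense_default():
    import numpy as np
    data = tvm.placeholder((4, 16), name='data')
    weight = tvm.placeholder((8, 16), name='weight')
    out = dense_default(data, weight)
    s = tvm.create_schedule(out.op)
    func = tvm.build(s, [data, weight, out], "llvm")
    ctx = tvm.cpu(0)
    a_np = np.random.uniform(size=(4, 16)).astype(data.dtype)
    w_np = np.random.uniform(size=(8, 16)).astype(weight.dtype)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    c = tvm.nd.empty((4, 8), out.dtype, ctx)
    func(a, w, c)
    tvm.testing.assert_allclose(c.asnumpy(), a_np.dot(w_np.T), rtol=1e-5)
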
def test_dot():
    nn = 12
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    k = tvm.reduce_axis((0, n), 'k')
    C = tvm.compute((1,), lambda _: tvm.sum(A[k] * B[k], axis=k), name='C')
    s = tvm.create_schedule(C.op)
    fapi = lower(s, [A, B, C])

    def verify(target):
        if not tvm.module.enabled(target):
            print("Target %s is not enabled" % target)
            return
        f = tvm.codegen.build_module(fapi, target)
        # verify
        ctx = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((1,), dtype=C.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-4)

    verify("llvm")

def intrin_gemv(m, l):
    a = tvm.placeholder((l,), name='a')
    b = tvm.placeholder((m, l), name='b')
    k = tvm.reduce_axis((0, l), name='k')
    c = tvm.compute((m,), lambda i: tvm.sum(a[k] * b[i, k], axis=k), name='c')
    Ab = tvm.decl_buffer(a.shape, a.dtype,
                         name="A",
                         offset_factor=1,
                         strides=[1])
    Bb = tvm.decl_buffer(b.shape, b.dtype,
                         name="B",
                         offset_factor=1,
                         strides=[tvm.var("s1"), 1])
    Cb = tvm.decl_buffer(c.shape, c.dtype,
                         name="C",
                         offset_factor=1,
                         strides=[1])

    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        aa, bb = ins
        cc = outs[0]
        ib.emit(tvm.call_extern("int32", "gemv_update",
                                cc.access_ptr("w"),
                                aa.access_ptr("r"),
                                bb.access_ptr("r"),
                                m, l, bb.strides[0]))
        return ib.get()

    with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})

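
# --- usage sketch -------------------------------------------------------
# Tensorize sketch, hedged: "gemv_update" is an external symbol that would
# have to be linked in before tvm.build, so this only lowers the schedule to
# inspect the IR. The inner 16-wide slice of the output plus the full
# reduction matches the intrinsic's compute pattern.
def _tensorize_gemv_example():
    M, K, L = 64, 64, 64
    A = tvm.placeholder((M, L), name='A')
    B = tvm.placeholder((K, L), name='B')
    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((M, K), lambda i, j: tvm.sum(A[i, k] * B[j, k], axis=k), name='C')
    gemv = intrin_gemv(16, L)
    s = tvm.create_schedule(C.op)
    jo, ji = s[C].split(C.op.axis[1], factor=16)
    s[C].tensorize(ji, gemv)
    print(tvm.lower(s, [A, B, C], simple_mode=True))
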
def matmul(N, L, M, dtype):
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)

    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
    s = tvm.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    ##### define space begin #####
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_x", x, num_outputs=2)
    ##### define space end #####

    # schedule according to config
    yo, yi = cfg["tile_y"].apply(s, C, y)
    xo, xi = cfg["tile_x"].apply(s, C, x)

    s[C].reorder(yo, xo, k, yi, xi)

    return s, [A, B, C]

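
# --- usage sketch -------------------------------------------------------
# Tuning sketch, hedged: it assumes matmul is registered with
# @autotvm.template and that a local LLVM runtime is available. This mirrors
# the standard autotvm flow: create a task over the template, then let a
# random tuner measure a handful of configs and log them.
def _tune_matmul_example():
    task = autotvm.task.create(matmul, args=(512, 512, 512, 'float32'),
                               target='llvm')
    measure_option = autotvm.measure_option(
        builder='local', runner=autotvm.LocalRunner(number=5))
    tuner = autotvm.tuner.RandomTuner(task)
    tuner.tune(n_trial=10, measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file('matmul.log')])
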
def test_rfactor():
    n = tvm.convert(1027)
    A = tvm.placeholder((n,), name='A')
    k = tvm.reduce_axis((0, n))
    B = tvm.compute((1,), lambda i: tvm.sum(A[k], axis=k), name='B')
    # schedule
    s = tvm.create_schedule(B.op)
    kf, ki = s[B].split(k, nparts=4)
    BF = s.rfactor(B, kf)
    s[BF].parallel(BF.op.axis[0])
    # one line to build the function.
    def check_target(target="llvm"):
        if not tvm.module.enabled(target):
            return
        ctx = tvm.cpu(0)
        fapi = tvm.lower(s, args=[A, B])
        fsum = tvm.build(fapi, target=target, name="mysum")
        # launch the kernel.
        n = 1027
        a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx)
        fsum(a, b)
        res = np.sum(a.asnumpy(), axis=0)
        tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4)

    check_target()

def intrin_gemv(m, n):
    w = tvm.placeholder((m, n), name='w')
    x = tvm.placeholder((n,), name='x')
    k = tvm.reduce_axis((0, n), name='k')
    z = tvm.compute((m,), lambda i: tvm.sum(w[i, k] * x[k], axis=k), name='z')
    Wb = tvm.decl_buffer(w.shape, w.dtype,
                         name="W",
                         offset_factor=16,
                         strides=[tvm.var('ldw'), 1])

    def intrin_func(ins, outs):
        ww, xx = ins
        zz = outs[0]
        ww_ptr = ww.access_ptr("r")
        xx_ptr = xx.access_ptr("r")
        zz_ptr = zz.access_ptr("w")
        body = tvm.call_packed("gemm", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        reset = tvm.call_packed("fill_zero", zz_ptr, n)
        update = tvm.call_packed("gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        return body, reset, update

    with tvm.build_config(data_alignment=16, offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func, binds={w: Wb})

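
# --- usage sketch -------------------------------------------------------
# Hedged sketch of how the (body, reset, update) triple above is consumed:
# the "gemm"/"fill_zero"/"gemv_add" packed functions are assumed to be
# registered at runtime, so we stop at tvm.lower. When the reduction axis is
# split outside the tensorized region, TVM emits `reset` once per output tile
# and `update` once per outer reduction iteration.
def _tensorize_gemv_update_example():
    M, K = 64, 64
    W = tvm.placeholder((M, K), name='W')
    X = tvm.placeholder((K,), name='X')
    k = tvm.reduce_axis((0, K), name='k')
    Z = tvm.compute((M,), lambda i: tvm.sum(W[i, k] * X[k], axis=k), name='Z')
    gemv = intrin_gemv(16, 16)
    s = tvm.create_schedule(Z.op)
    io, ii = s[Z].split(Z.op.axis[0], factor=16)
    ko, ki = s[Z].split(Z.op.reduce_axis[0], factor=16)
    s[Z].reorder(io, ko, ii, ki)
    s[Z].tensorize(ii, gemv)
    print(tvm.lower(s, [W, X, Z], simple_mode=True))
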
def get_gemm_feature(target):
    k = tvm.reduce_axis((0, N), 'k')
    A = tvm.placeholder((N, N), name='A')
    B = tvm.placeholder((N, N), name='B')
    C = tvm.compute(A.shape, lambda y, x: tvm.sum(A[y, k] * B[k, x], axis=k),
                    name='C')

    s = tvm.create_schedule(C.op)

    y, x = s[C].op.axis
    axes = list(s[C].tile(y, x, 8, 8)) + [k]
    perm = np.random.permutation(5)
    axes = [axes[x] for x in perm]
    s[C].reorder(*axes)

    if "gpu" in target.keys:
        pick = []
        # filter out reduction axis
        for i in range(len(perm)):
            if perm[i] != 4:
                pick.append(axes[i])
        s[C].bind(pick[0], tvm.thread_axis("blockIdx.x"))
        s[C].bind(pick[1], tvm.thread_axis("vthread"))
        s[C].bind(pick[2], tvm.thread_axis("threadIdx.y"))

    with target:
        feas = feature.get_itervar_feature(s, [A, B, C])
        feas = feature.flatten_itervar_feature(feas)
    return feas

def bitserial_dense_default(cfg, data, weight, data_bits, weight_bits,
                            pack_dtype='uint32', out_dtype='int16', unipolar=True):
    """Bitserial dense implementation. TODO: Why are these separate

    Parameters
    ----------
    data : tvm.Tensor
        2-D with shape [batch, in_dim]
    weight : tvm.Tensor
        2-D with shape [out_dim, in_dim] or
        3-D with shape [out_dim, weight_bits, in_dim]
    Returns
    -------
    output : tvm.Tensor
        2-D with shape [batch, out_dim]
    """
    data_packed = bitpack(data, data_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype)
    if len(weight.shape) == 2:
        weight_packed = bitpack(weight, weight_bits, pack_axis=1, bit_axis=1,
                                pack_type=pack_dtype)
    else:
        weight_packed = weight
    Y, DB, K = get_const_tuple(data_packed.shape)
    X, WB, _ = get_const_tuple(weight_packed.shape)
    ######## Search space
    x, y = cfg.axis(X), cfg.axis(Y)
    db, wb, k = cfg.reduce_axis(DB), cfg.reduce_axis(WB), cfg.reduce_axis(K)
    ko, ki = cfg.define_split('tile_k', k, policy='all', num_outputs=2)
    yo, yi = cfg.define_split('tile_y', y, policy='all', num_outputs=2)
    xo, xi = cfg.define_split('tile_x', x, policy='all', num_outputs=2)
    cfg.define_reorder('reorder_0', [yo, xo, ko, yi, wb, db, ki, xi],
                       policy='candidate', candidate=[
                           [yo, xo, ko, yi, wb, db, ki, xi],
                           [yo, xo, yi, ko, wb, db, ki, xi]])
    cfg.define_annotate('ann_reduce', [db, wb], policy='try_unroll')
    cfg.define_annotate('ann_spatial', [yi, xi], policy='try_unroll_vec')
    ###### Compute rule
    VX = cfg['tile_x'].size[-1]

    wvshape = (X // VX, WB, VX, K)
    oshape = (Y, X)

    k = tvm.reduce_axis((0, K), name='k')
    db = tvm.reduce_axis((0, DB), name='db')
    wb = tvm.reduce_axis((0, WB), name='wb')

    # Tile data and weights
    weight_vec = tvm.compute(wvshape, lambda xo, wb, vx, k:
                             weight_packed[xo * VX + vx][wb][k], name='weight_vec')

    matmul_unipolar = tvm.compute(oshape, lambda i, j: tvm.sum(
        (tvm.popcount(weight_vec[j // VX, wb, j % VX, k] & data_packed[i, db, k]) -
         tvm.popcount(~weight_vec[j // VX, wb, j % VX, k] & data_packed[i, db, k])
        ).astype(out_dtype)
        << (db + wb).astype(out_dtype), axis=[wb, db, k]),
        tag='bitserial_dense_unipolar')

    matmul = tvm.compute(oshape, lambda i, j: tvm.sum(
        tvm.popcount(weight_vec[j // VX, wb, j % VX, k] & data_packed[i, db, k])
        .astype(out_dtype)
        << (db + wb).astype(out_dtype), axis=[wb, db, k]),
        tag='bitserial_dense')

    # binary ops
    cfg.add_flop(2 * Y * X * K * binary_op_multiplier(pack_dtype))

    if unipolar:
        return matmul_unipolar
    return matmul

def _decl_im2col(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
    """declare the Im2Col method for conv2d"""
    _, CI, IH, IW = [x.value for x in data.shape]
    CO, _, KH, KW = [x.value for x in kernel.shape]
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride

    N = 1
    OH = (IH + 2 * HPAD - KH) // HSTR + 1
    OW = (IW + 2 * WPAD - KW) // WSTR + 1

    DO_PAD = (HPAD != 0 or WPAD != 0)
    if DO_PAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    ALIGN = 16
    def upround(x, align):
        return (x + align - 1) // align * align

    # A [CO, CI * KH * KW]
    reduce_len = upround(CI * KH * KW, ALIGN)
    A = tvm.compute((upround(CO, ALIGN), reduce_len), lambda i, j:
                    kernel[i][j // KW // KH][j // KW % KH][j % KW], name='A')

    # B [CI * KH * KW, N * OH * OW]
    B = tvm.compute((reduce_len, upround(N * OH * OW, ALIGN)), lambda i, j:
                    tvm.select(tvm.all(i < CI * KH * KW, j < N * OH * OW),
                               data_pad[j // (OH*OW)][i // (KH*KW)]
                               [j // OW % OH*HSTR + i // KW % KH]
                               [j % OW*WSTR + i % KW],
                               tvm.const(0, data_pad.dtype)), name='B')

    gemm_n, gemm_l, gemm_m = A.shape[0], reduce_len, B.shape[1]

    # C [CO, N * OH * OW]
    k = tvm.reduce_axis((0, gemm_l), name='k')
    C = tvm.compute((gemm_n, gemm_m), lambda i, j:
                    tvm.sum(A[i, k] * B[k, j], axis=k), name='C')

    # output
    # the last term C[gemm_n-1, gemm_m-1] is for enabling the alignment,
    # otherwise the alignment above will be eliminated by bound inference
    output = tvm.compute((N, CO, OH, OW), lambda n, co, h, w:
                         C[co][n * OH * OW + h * OW + w] +
                         tvm.const(0, C.dtype) * C[gemm_n-1, gemm_m-1],
                         name='output', tag='im2col_conv_output')

    return output

def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, pre_computed):
    """Compute declaration for winograd"""
    assert layout == 'NCHW'

    tile_size = _infer_tile_size(data, kernel)

    N, CI, H, W = get_const_tuple(data.shape)

    if not pre_computed:  # kernel tensor is raw tensor, do strict check
        if isinstance(dilation, int):
            dilation_h = dilation_w = dilation
        else:
            dilation_h, dilation_w = dilation
        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))

        CO, CI, KH, KW = get_const_tuple(kernel.shape)
        HPAD, WPAD, _, _ = nn.get_pad_tuple(padding, kernel)
        HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides
        assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1 and KH == 3 and KW == 3
    else:
        # kernel tensor is pre-transformed. this op is created by
        # alter op layout, do not check
        # dilation is not supported
        HSTR = WSTR = 1
        HPAD = WPAD = 1
        KH = KW = 3
        _, _, CI, CO = get_const_tuple(kernel.shape)

    data_pad = nn.pad(data, (0, 0, HPAD, WPAD), (0, 0, HPAD, WPAD), name="data_pad")

    if tile_size == 4:
        G_data = np.array([
            [1 / 4.0, 0, 0],
            [-1 / 6.0, -1 / 6.0, -1 / 6.0],
            [-1 / 6.0, 1 / 6.0, -1 / 6.0],
            [1 / 24.0, 1 / 12.0, 1 / 6.0],
            [1 / 24.0, -1 / 12.0, 1 / 6.0],
            [0, 0, 1]], dtype=np.float32)
        B_data = np.array([
            [4, 0, 0, 0, 0, 0],
            [0, -4, 4, -2, 2, 4],
            [-5, -4, -4, -1, -1, 0],
            [0, 1, -1, 2, -2, -5],
            [1, 1, 1, 1, 1, 0],
            [0, 0, 0, 0, 0, 1]], out_dtype)
        A_data = np.array([
            [1, 0, 0, 0],
            [1, 1, 1, 1],
            [1, -1, 1, -1],
            [1, 2, 4, 8],
            [1, -2, 4, -8],
            [0, 0, 0, 1]], out_dtype)
    elif tile_size == 2:
        G_data = np.array([
            [1, 0, 0],
            [1.0 / 2, 1.0 / 2, 1.0 / 2],
            [1.0 / 2, -1.0 / 2, 1.0 / 2],
            [0, 0, 1]], np.float32)
        B_data = np.array([
            [1, 0, 0, 0],
            [0, 1, -1, 1],
            [-1, 1, 1, 0],
            [0, 0, 0, -1]], out_dtype)
        A_data = np.array([
            [1, 0],
            [1, 1],
            [1, -1],
            [0, -1]], out_dtype)
    else:
        raise ValueError("Unsupported tile size for winograd: " + str(tile_size))

    m = A_data.shape[1]
    r = 3
    alpha = m + r - 1
    H = (H + 2 * HPAD - KH) // HSTR + 1
    W = (W + 2 * WPAD - KW) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    # transform kernel
    if not pre_computed:
        G = const_matrix(G_data, 'G')
        r_kh = tvm.reduce_axis((0, KH), name='r_kh')
        r_kw = tvm.reduce_axis((0, KW), name='r_kw')
        kernel_pack = tvm.compute((alpha, alpha, CI, CO), lambda eps, nu, ci, co:
                                  tvm.sum(kernel[co][ci][r_kh][r_kw] *
                                          G[eps][r_kh] * G[nu][r_kw],
                                          axis=[r_kh, r_kw]), name='kernel_pack')
    else:
        kernel_pack = kernel

    # pack input tile
    input_tile = tvm.compute((CI, P, alpha, alpha), lambda c, p, eps, nu:
                             data_pad[p // (nH * nW)][c][p // nW % nH * m + eps]
                             [p % nW * m + nu], name='d')

    # transform data
    B = const_matrix(B_data)
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    data_pack = tvm.compute((alpha, alpha, CI, P), lambda eps, nu, ci, p:
                            tvm.sum(input_tile[ci][p][r_a][r_b] *
                                    B[r_a][eps] * B[r_b][nu],
                                    axis=[r_a, r_b]), name='data_pack')

    # do batch gemm
    ci = tvm.reduce_axis((0, CI), name='ci')
    bgemm = tvm.compute((alpha, alpha, CO, P), lambda eps, nu, co, p:
                        tvm.sum(kernel_pack[eps][nu][ci][co] *
                                data_pack[eps][nu][ci][p],
                                axis=[ci]), name='bgemm')

    # inverse transform
    A = const_matrix(A_data)
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    inverse = tvm.compute((CO, P, m, m), lambda co, p, vh, vw:
                          tvm.sum(bgemm[r_a][r_b][co][p] *
                                  A[r_a][vh] * A[r_b][vw],
                                  axis=[r_a, r_b]), name='inverse')

    # output
    output = tvm.compute((N, CO, H, W), lambda n, co, h, w:
                         inverse[co][n * nH * nW + (h // m) * nW + w // m][h % m][w % m],
                         name='output', tag='conv2d_nchw_winograd')
    cfg.add_flop(2 * N * CO * H * W * CI * KH * KW)

    return output

def spatial_pack_nchw(cfg, data, kernel, stride, padding, in_bits, weight_bits,
                      pack_dtype='uint32', out_dtype='int16', unipolar=True):
    """ Compute convolution with pack on spatial axes. """
    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
    data_q = bitpack(data, in_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype)
    # Check if kernel is already bitpacked
    if len(kernel.shape) == 4:
        kernel_q = bitpack(kernel, weight_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype)
        KB, CO, _, KH, KW = get_const_tuple(kernel_q.shape)
    else:
        kernel_vec = kernel
        OCO, _, KH, KW, KB, VC = get_const_tuple(kernel_vec.shape)
        CO = OCO * VC

    IB, N, CI, H, W = get_const_tuple(data_q.shape)

    if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2):
        TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel)
    else:
        TPAD, LPAD, DPAD, RPAD = padding
    pad_before = [0, 0, 0, TPAD, LPAD]
    pad_after = [0, 0, 0, DPAD, RPAD]

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride
    HCAT, WCAT = KH-1, KW-1

    TH = H + TPAD + DPAD
    TW = W + LPAD + RPAD
    OH = (H + TPAD + DPAD - KH) // HSTR + 1
    OW = (W + LPAD + RPAD - KW) // WSTR + 1

    # ==================== define configuration space ====================
    n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
    ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
    ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits)

    co, vc = cfg.define_split('tile_co', co, policy='all', num_outputs=2,
                              filter=lambda x: max(x.size[1:]) <= 16)
    oh, vh = cfg.define_split('tile_oh', oh, policy='all', num_outputs=2,
                              filter=lambda x: max(x.size[1:]) <= 16)
    ow, vw = cfg.define_split('tile_ow', ow, policy='all', num_outputs=2,
                              filter=lambda x: max(x.size[1:]) <= 16)
    cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll')

    re_axes = cfg.define_reorder("reorder_0",
                                 [n, co, oh, ow, vc, vh, vw, kh, kw, kb, ib, ci],
                                 policy='interval_all', interval=(6, 11))
    cfg.add_flop(2 * N * OH * OW * CO * CI * 8 * KH * KW)  # these are actually binary ops
    # ====================

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT, IB)
    kvshape = (CO//VC, CI, KH, KW, KB, VC)
    ovshape = (1, CO//VC, OH//VH, OW//VW, VH, VW, VC)
    oshape = (1, CO, OH, OW)

    if (TPAD != 0 or RPAD != 0):
        data_pad = pad(data_q, (0, 0, 0, TPAD, LPAD), (0, 0, 0, DPAD, RPAD), name="data_pad")
    else:
        data_pad = data_q

    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw, b:
                           data_pad[b][n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec')

    if len(kernel.shape) == 4:
        kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, b, vc:
                                 kernel_q[b][co*VC+vc][ci][dh][dw], name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')
    b1 = tvm.reduce_axis((0, IB), name='ib')
    b2 = tvm.reduce_axis((0, KB), name='kb')

    def _conv(n, co, h, w, vh, vw, vc):
        b1b2 = (b1+b2).astype(out_dtype)
        if unipolar:
            return tvm.sum((tvm.popcount(
                data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) &
                kernel_vec[co, ci, dh, dw, b2, vc].astype(out_dtype)) -
                            tvm.popcount(
                                data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype)
                                & ~kernel_vec[co, ci, dh, dw, b2, vc]).astype(out_dtype)) << b1b2,
                           axis=[ci, dh, dw, b1, b2])

        return tvm.sum((tvm.popcount(
            data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] &
            kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2,
                       axis=[ci, dh, dw, b1, b2])

    conv = tvm.compute(ovshape, _conv, name='conv_out')

    return tvm.compute(oshape, lambda n, co, h, w:
                       conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
                       name='conv_vec', tag='spatial_bitserial_conv_nchw')

with ScheduleProcHelper():
    env = nnpu.get_env()
    shape = (48, 48)
    insn_shape = (16, 16)
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder(shape, dtype_n, 'a')
    b = tvm.placeholder(shape, dtype_n, 'b')
    sph = ScheduleProcHelper.current

    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)

    k = tvm.reduce_axis((0, shape[1]), 'k')
    dot_shape = (shape[0], )
    dot_buf = tvm.compute(dot_shape,
                          lambda i: tvm.sum(a_buf[i, k].astype(dtype_w) *
                                            b_buf[i, k].astype(dtype_w), k),
                          'dot_buf')
    sph.MarkScope(dot_buf, 'acc')
    res_buf = nnpu.utils.CopyAccToBuf(dot_buf, 'res')

    res_host, _ = nnpu.utils.CopyBufToH(res_buf, 'res')

    # tensorize
    s = nnpu.create_schedule(res_host.op)
    xo, ro, xi, ri = s[dot_buf].tile(dot_buf.op.axis[0], dot_buf.op.reduce_axis[0],
                                     insn_shape[0], insn_shape[1])

def _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
    out_dtype = data.dtype if out_dtype is None else out_dtype
    assert layout == 'NCHW', "only support NCHW convolution for AVX"
    assert isinstance(dilation, int) or len(dilation) == 2
    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    HPAD, WPAD = padding
    HSTR, WSTR = strides

    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
    num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)

    pad_height = in_height + 2 * HPAD
    pad_width = in_width + 2 * WPAD

    dilated_kernel_h = (kernel_height - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_width - 1) * dilation_w + 1
    out_height = (in_height + 2 * HPAD - dilated_kernel_h) // HSTR + 1
    out_width = (in_width + 2 * WPAD - dilated_kernel_w) // WSTR + 1

    # pack data
    DOPAD = (HPAD != 0 or WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    # fetch schedule
    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

    shape = (batch_size, in_channel // ic_bn, pad_height, ic_bn, pad_width)
    data_vec = tvm.compute(shape,
                           lambda n, C, h, c, w: data_pad[n, C * ic_bn + c, h, w],
                           name='data_vec')

    # pack kernel
    shape = (num_filter // oc_bn, in_channel // ic_bn,
             kernel_height, kernel_width, ic_bn, oc_bn)
    kernel_vec = tvm.compute(shape,
                             lambda CO, CI, h, w, ci, co:
                             kernel[CO * oc_bn + co, CI * ic_bn + ci, h, w],
                             name='kernel_vec')

    # convolution
    oshape = (batch_size, num_filter // oc_bn, out_height, out_width, oc_bn)
    unpack_shape = (batch_size, num_filter, out_height, out_width)

    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')

    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
                       tvm.sum(data_vec[n, ic // ic_bn, oh * HSTR + kh * dilation_h,
                                        ic % ic_bn,
                                        ow * WSTR + kw * dilation_w].astype(out_dtype) *
                               kernel_vec[oc_chunk, ic // ic_bn, kh, kw,
                                          ic % ic_bn, oc_block].astype(out_dtype),
                               axis=[ic, kh, kw]), name='conv')

    unpack = tvm.compute(unpack_shape,
                         lambda n, c, h, w: conv[n, c // oc_bn, h, w, c % oc_bn]
                         .astype(out_dtype),
                         name='output_unpack',
                         tag='conv2d_nchw')
    return unpack

def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtype=None):
    """Group convolution operator in NCHW layout.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    Filter : tvm.Tensor
        4-D with shape [num_filter, in_channel // groups, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    dilation : int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    groups : int
        number of groups

    out_dtype : str
        The output type. This is used for mixed precision.

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channel, in_height, in_width = get_const_tuple(Input.shape)
    num_filter, _, kernel_h, kernel_w = get_const_tuple(Filter.shape)

    assert in_channel % groups == 0, "input channels must divide group size"
    assert num_filter % groups == 0, "output channels must divide group size"

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (kernel_h, kernel_w))
    # compute the output shape
    out_channel = num_filter
    out_height = simplify(
        (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1)
    # compute graph
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    temp = pad(Input, pad_before, pad_after, name="pad_temp")
    rc = tvm.reduce_axis((0, in_channel // groups), name='rc')
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')
    return tvm.compute(
        (batch, out_channel, out_height, out_width),
        lambda nn, ff, yy, xx: tvm.sum(
            temp[nn, ff // (num_filter // groups) * (in_channel // groups) + rc,
                 yy * stride_h + ry * dilation_h,
                 xx * stride_w + rx * dilation_w].astype(out_dtype) *
            Filter[ff, rc, ry, rx].astype(out_dtype),
            axis=[rc, ry, rx]), tag='group_conv2d_nchw')

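
# --- usage sketch -------------------------------------------------------
# Shape check, hedged: assumes this module's pad/get_pad_tuple/simplify
# helpers. With groups=4, each of the 8 filters only sees
# in_channel // groups = 4 input channels, which is why the Filter tensor
# carries 4 channels in its second dimension.
def _group_conv_shape_example():
    data = tvm.placeholder((1, 16, 8, 8), name='data')
    kern = tvm.placeholder((8, 4, 3, 3), name='filter')
    out = group_conv2d_nchw(data, kern, stride=1, padding=1, dilation=1, groups=4)
    assert get_const_tuple(out.shape) == (1, 8, 8, 8)
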
def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype='float32'):
    """Conv2D operator for nChw[x]c layout.

    Parameters
    ----------
    data : tvm.Tensor
        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]

    kernel : tvm.Tensor
        6-D with shape
        [num_filter_chunk, in_channel_chunk, filter_height, filter_width,
        in_channel_block, num_filter_block]

    stride : int or a list/tuple of two ints
        stride size, or [stride_height, stride_width]

    padding : int or a list/tuple of two ints
        padding size, or [pad_height, pad_width]

    dilation: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    layout : str
        Input data layout

    out_layout : str
        Output data layout

    out_dtype : str
        output data type

    Returns
    -------
    output : tvm.Tensor
        5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
    """
    # search platform specific declaration first
    # default declaration
    # layout and out_layout are not used here,
    # we keep them for debug convenience when dumping autotvm workload
    HSTR, WSTR = stride if isinstance(stride, (tuple, list)) else (stride, stride)
    dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
    assert (dh, dw) == (1, 1), "Does not support dilation"

    n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape)
    in_channel = ic_chunk * ic_bn
    if data.dtype == 'uint8':
        oc_chunk, _, kernel_height, kernel_width, _, oc_bn, _ = get_const_tuple(kernel.shape)
    else:
        oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple(kernel.shape)
    num_filter = oc_chunk * oc_bn

    # dilation is restricted to (1, 1) above, so the dilated kernel size
    # equals the raw kernel size
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (kernel_height, kernel_width))
    HPAD = pad_top + pad_down
    WPAD = pad_left + pad_right

    # output shape
    out_height = (ih + HPAD - kernel_height) // HSTR + 1
    out_width = (iw + WPAD - kernel_width) // WSTR + 1
    oshape = (n, oc_chunk, out_height, out_width, oc_bn)

    # DOPAD
    DOPAD = (HPAD != 0 or WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, pad_top, pad_left, 0),
                       (0, 0, pad_down, pad_right, 0), name="data_pad")
    else:
        data_pad = data

    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')

    if data.dtype == 'uint8':
        assert out_dtype == "int32", \
            "INT8 convolution requires input dtype = uint8 and output dtype=int32"
        # Intel performs dot product of 2 "4" Int8 values
        # Current implementation requires ic_bn to be a multiple of 4
        n_elems = 4
        assert ic_bn % n_elems == 0

        ic_outer = tvm.reduce_axis((0, in_channel // ic_bn), name='ic_outer')
        ic_f_inner = tvm.reduce_axis((0, ic_bn // n_elems), name='ic_f_inner')
        ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
        return tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
                           tvm.sum(data_pad[n, ic_outer, oh * HSTR + kh,
                                            ow * WSTR + kw,
                                            ic_f_inner * n_elems + ic_s_inner]
                                   .astype(out_dtype) *
                                   kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner,
                                          oc_block, ic_s_inner].astype(out_dtype),
                                   axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]),
                           name='conv2d_NCHWc_int8',
                           tag="conv2d_NCHWc_int8")
    # else: fp implementation
    return tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
                       tvm.sum(data_pad[n, ic // ic_bn, oh * HSTR + kh,
                                        ow * WSTR + kw, ic % ic_bn].astype(out_dtype) *
                               kernel[oc_chunk, ic // ic_bn, kh, kw,
                                      ic % ic_bn, oc_block],
                               axis=[ic, kh, kw]),
                       name='conv2d_NCHWc', tag="conv2d_NCHWc")

def spatial_pack_nhwc(cfg, data, kernel, stride, padding, in_bits, weight_bits, pack_dtype='uint32', out_dtype='int16', unipolar=True): """ Compute convolution with pack on spatial axes. """ assert data.shape[ 0].value == 1, "spatial pack convolution only support batch size=1" data_q = bitpack(data, in_bits, pack_axis=3, bit_axis=4, pack_type=pack_dtype) pack_kernel = len(kernel.shape) == 4 if pack_kernel: kernel_q = bitpack(kernel, weight_bits, pack_axis=2, bit_axis=4, pack_type=pack_dtype) else: kernel_q = kernel KH, KW, _, CO, KB = get_const_tuple(kernel_q.shape) N, H, W, CI, IB = get_const_tuple(data_q.shape) if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2): TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, (KH, KW)) else: TPAD, LPAD, DPAD, RPAD = padding pad_before = [0, TPAD, LPAD, 0, 0] pad_after = [0, DPAD, RPAD, 0, 0] if isinstance(stride, (tuple, list)): HSTR, WSTR = stride else: HSTR, WSTR = stride, stride HCAT, WCAT = KH - 1, KW - 1 PAD_H = H + (TPAD + DPAD) PAD_W = W + (LPAD + RPAD) OH = (PAD_H - KH) // HSTR + 1 OW = (PAD_W - KW) // WSTR + 1 oshape = (1, OH, OW, CO) # ==================== define configuration space ==================== n, oh, ow, co = cfg.axis(N), cfg.axis(OH), cfg.axis(OW), cfg.axis(CO) ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW) ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits) co, vc = cfg.define_split('tile_co', co, num_outputs=2, filter=lambda x: max(x.size[1:]) <= 16) oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2, filter=lambda x: max(x.size[1:]) <= 16) ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2, filter=lambda x: max(x.size[1:]) <= 16) cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll') cfg.define_reorder("reorder_0", [n, oh, ow, co, vh, vw, kh, kw, kb, ib, vc, ci], policy='interval_all', interval=(3, 7)) # binary ops cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype)) # ==================== VC = cfg["tile_co"].size[-1] VH = cfg["tile_oh"].size[-1] VW = cfg["tile_ow"].size[-1] dvshape = (1, PAD_H // (VH * HSTR), PAD_W // (VW * WSTR), VH * HSTR + HCAT, VW * WSTR + WCAT, CI, IB) kvshape = (CO, KH, KW, CI, VC, KB) ovshape = (1, OH, OW, CO, VH, VW, VC) oshape = (1, OH, OW, CO) if TPAD != 0 or LPAD != 0 or DPAD != 0 or RPAD != 0: data_pad = pad(data_q, pad_before, pad_after, name="data_pad") else: data_pad = data_q data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, ci, b: \ data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][ci][b], name='data_vec') kernel_vec = tvm.compute(kvshape, lambda co, dh, dw, ci, vc, b: \ kernel_q[dh][dw][ci][co*VC+vc][b], name='kernel_vec') ci = tvm.reduce_axis((0, CI), name='ci') dh = tvm.reduce_axis((0, KH), name='dh') dw = tvm.reduce_axis((0, KW), name='dw') b1 = tvm.reduce_axis((0, IB), name='ib') b2 = tvm.reduce_axis((0, KB), name='kb') def _conv(n, h, w, co, vh, vw, vc): b1b2 = (b1 + b2).astype(out_dtype) if unipolar: return tvm.sum( ((tvm.popcount( data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ci, b1] & kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) - tvm.popcount( data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ci, b1] & ~kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype)) << b1b2), axis=[dh, dw, ci, b1, b2]) return tvm.sum(tvm.popcount( data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ci, b1] & kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) << b1b2, axis=[dh, dw, ci, b1, b2]) conv = tvm.compute(ovshape, _conv, name='conv') idxdiv = tvm.indexdiv idxmod = tvm.indexmod return tvm.compute( oshape, lambda n, h, w, co: conv[n][idxdiv(h, VH)][idxdiv(w, VW)][idxdiv( co, VC)][idxmod(h, VH)][idxmod(w, VW)][idxmod(co, VC)], name='output_unpack', tag='spatial_bitserial_conv_nhwc')
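# A reduced sketch of what _conv accumulates above: with 1-bit values packed
# into uint32 words, a dot product becomes popcount(x & w) summed over the
# packed words (the unipolar variant additionally subtracts popcount(x & ~w)).
x = tvm.placeholder((8,), dtype='uint32', name='x')
w = tvm.placeholder((8,), dtype='uint32', name='w')
k = tvm.reduce_axis((0, 8), name='k')
bdot = tvm.compute((1,),
                   lambda _: tvm.sum(tvm.popcount(x[k] & w[k]).astype('int32'), axis=k),
                   name='bdot')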
def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None): """Convolution operator in NCHW layout. Parameters ---------- Input : tvm.Tensor 4-D with shape [batch, in_channel, in_height, in_width] Filter : tvm.Tensor 4-D with shape [num_filter, in_channel, filter_height, filter_width] stride : int or a list/tuple of two ints Stride size, or [stride_height, stride_width] padding : int or str Padding size, or ['VALID', 'SAME'] dilation: int or a list/tuple of two ints dilation size, or [dilation_height, dilation_width] Returns ------- Output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ if out_dtype is None: out_dtype = Input.dtype assert isinstance(stride, int) or len(stride) == 2 assert isinstance(dilation, int) or len(dilation) == 2 if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation batch, in_channel, in_height, in_width = Input.shape num_filter, channel, kernel_h, kernel_w = Filter.shape # compute the output shape dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) out_channel = num_filter out_height = simplify( (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) out_width = simplify( (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) # compute graph pad_before = [0, 0, pad_top, pad_left] pad_after = [0, 0, pad_down, pad_right] temp = pad(Input, pad_before, pad_after, name="pad_temp") rc = tvm.reduce_axis((0, in_channel), name='rc') ry = tvm.reduce_axis((0, kernel_h), name='ry') rx = tvm.reduce_axis((0, kernel_w), name='rx') return tvm.compute((batch, out_channel, out_height, out_width), lambda nn, ff, yy, xx: tvm. sum(temp[nn, rc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w].astype(out_dtype) * Filter[ff, rc, ry, rx].astype(out_dtype), axis=[rc, ry, rx]), tag="conv2d_nchw")
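# Shape sanity check for conv2d_nchw (hypothetical sizes): a 3x3 kernel with
# padding 1 and stride 1 preserves the spatial dimensions.
data = tvm.placeholder((1, 3, 32, 32), name='data')
kern = tvm.placeholder((16, 3, 3, 3), name='kern')
out = conv2d_nchw(data, kern, stride=1, padding=1, dilation=1)
# out.shape == (1, 16, 32, 32): (32 - 3 + 1 + 1) // 1 + 1 = 32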
def gemm_int8(n, m, l): A = tvm.placeholder((n, l), name='A', dtype='int8') B = tvm.placeholder((m, l), name='B', dtype='int8') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute((n, m), lambda i, j: tvm.sum(A[i, k].astype('int32') * B[j, k].astype( 'int32'), axis=k), name='C') cfg = autotvm.get_config() s = tvm.create_schedule(C.op) y, x = C.op.axis AA = s.cache_read(A, 'shared', [C]) BB = s.cache_read(B, 'shared', [C]) AL = s.cache_read(AA, 'local', [C]) BL = s.cache_read(BB, 'local', [C]) CC = s.cache_write(C, 'local') k = CC.op.reduce_axis[0] cfg.define_split('tile_k', cfg.axis(k), num_outputs=3, filter=lambda entity: entity.size[2] == 4 and \ entity.size[0] * 2 >= entity.size[1]) ko, kt, ki = cfg['tile_k'].apply(s, CC, k) s[CC].tensorize(ki, intrin_dp4a) block_x = tvm.thread_axis('blockIdx.x') block_y = tvm.thread_axis('blockIdx.y') thread_x = tvm.thread_axis('threadIdx.x') thread_y = tvm.thread_axis('threadIdx.y') def block_size_filter(entity): return entity.size[0] * 2 >= entity.size[1] * 2 and \ entity.size[1] <= 16 and entity.size[3] <= 4 cfg.define_split('tile_y', cfg.axis(y), num_outputs=4, filter=block_size_filter) cfg.define_split('tile_x', cfg.axis(x), num_outputs=4, filter=block_size_filter) by, tyz, ty, yi = cfg['tile_y'].apply(s, C, y) bx, txz, tx, xi = cfg['tile_x'].apply(s, C, x) s[C].bind(by, block_y) s[C].bind(bx, block_x) s[C].bind(tyz, tvm.thread_axis('vthread')) s[C].bind(txz, tvm.thread_axis('vthread')) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi) s[CC].compute_at(s[C], tx) yo, xo = CC.op.axis s[CC].reorder(ko, kt, yo, xo, ki) s[CC].unroll(kt) for stage in [AL, BL]: s[stage].compute_at(s[CC], kt) _, xi = s[stage].split(stage.op.axis[1], factor=4) s[stage].vectorize(xi) s[stage].double_buffer() cfg.define_knob('storage_align', [16, 48]) for stage in [AA, BB]: s[stage].storage_align(s[stage].op.axis[0], cfg['storage_align'].val, 0) s[stage].compute_at(s[CC], ko) fused = s[stage].fuse(*s[stage].op.axis) ty, tx = s[stage].split(fused, nparts=cfg['tile_y'].size[2]) tx, xi = s[stage].split(tx, nparts=cfg['tile_x'].size[2]) _, xi = s[stage].split(xi, factor=16) s[stage].bind(ty, thread_y) s[stage].bind(tx, thread_x) s[stage].vectorize(xi) cfg.define_knob('auto_unroll_max_step', [512, 1500]) s[C].pragma(by, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[C].pragma(by, 'unroll_explicit', False) cfg.add_flop(n*m*l*2) return s, [A, B, C]
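# intrin_dp4a is used but not defined in this snippet; one plausible definition
# (the cache scopes chosen here are an assumption) builds it from topi's CUDA
# dp4a tensor intrinsic, which maps a length-4 int8 dot product with int32
# accumulation onto the hardware __dp4a instruction:
from topi.cuda.tensor_intrin import dp4a
intrin_dp4a = dp4a('shared', 'shared', 'local')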
def bitserial_conv2d_nchw(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype='uint32', out_dtype='int16', unipolar=True): """Bitserial Conv2D operator. Parameters ---------- data : tvm.Tensor 4-D with shape [batch, in_channel, in_height, in_width] kernel : tvm.Tensor 4-D with shape [num_filter, in_channel, filter_height, filter_width] stride : int or a list/tuple of two ints stride size, or [stride_height, stride_width] padding : int or a list/tuple of two or four ints padding size, [pad_height, pad_width], [pad_top, pad_left, pad_down, pad_right] activation_bits: int number of bits used for activations/input elements weight_bits: int number of bits used for weight elements out_dtype: str return type of convolution pack_dtype: str bit packing type unipolar: bool if binarization style is in unipolar 1/0 format, instead of bipolar -1/+1 format Returns ------- output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ assert isinstance(stride, int) or len(stride) == 2 Input_q = bitpack(data, activation_bits, pack_axis=1, bit_axis=2, pack_type=pack_dtype) if len(kernel.shape) == 4: Filter_q = bitpack(kernel, weight_bits, pack_axis=1, bit_axis=4, pack_type=pack_dtype) else: Filter_q = kernel batch, in_channel, activation_bits, in_height, in_width = Input_q.shape num_filter, _, kernel_h, kernel_w, weight_bits = Filter_q.shape if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2): TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, (kernel_h, kernel_w)) else: TPAD, LPAD, DPAD, RPAD = padding pad_before = [0, 0, 0, TPAD, LPAD] pad_after = [0, 0, 0, DPAD, RPAD] PadInput_q = pad(Input_q, pad_before, pad_after, name="pad_temp") # compute the output shape if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride out_channel = num_filter out_height = (in_height - kernel_h + TPAD + DPAD) // stride_h + 1 out_width = (in_width - kernel_w + LPAD + RPAD) // stride_w + 1 rc = tvm.reduce_axis((0, in_channel), name='rc') ry = tvm.reduce_axis((0, kernel_h), name='ry') rx = tvm.reduce_axis((0, kernel_w), name='rx') b1 = tvm.reduce_axis((0, activation_bits), name='b1') b2 = tvm.reduce_axis((0, weight_bits), name='b2') if unipolar: def _conv(nn, ff, yy, xx): b1b2 = (b1 + b2).astype(out_dtype) return tvm.sum( ((tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & Filter_q[ff, rc, ry, rx, b2]) - tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & ~Filter_q[ff, rc, ry, rx, b2])) << (b1b2)).astype(out_dtype), axis=[rc, ry, rx, b2, b1]).astype(out_dtype) else: def _conv(nn, ff, yy, xx): b1b2 = (b1 + b2).astype(out_dtype) return tvm.sum( (tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & Filter_q[ff, rc, ry, rx, b2]) << (b1b2)).astype(out_dtype), axis=[rc, ry, rx, b2, b1]).astype(out_dtype) return tvm.compute((batch, out_channel, out_height, out_width), _conv, name="Conv2dOutput", tag="bitserial_conv2d_nchw")
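# Usage sketch for bitserial_conv2d_nchw (dtypes and shapes are illustrative):
# 1-bit activations against 1-bit weights; bitpack inside the operator turns
# the channel axis into packed uint32 words and adds the bit-plane axes.
data = tvm.placeholder((1, 64, 56, 56), dtype='uint8', name='data')
kernel = tvm.placeholder((64, 64, 3, 3), dtype='uint8', name='kernel')
out = bitserial_conv2d_nchw(data, kernel, stride=1, padding=1,
                            activation_bits=1, weight_bits=1)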
def _declaration_conv_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype): # layout and out_layout are not used here, # we keep them for debug convenience when dumping autotvm workload HPAD, WPAD = padding if isinstance(padding, (tuple, list)) else (padding, padding) HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) assert (dh, dw) == (1, 1), "Does not support dilation" n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) in_channel = ic_chunk * ic_bn if data.dtype == 'uint8': oc_chunk, _, kernel_height, kernel_width, _, oc_bn, _ = get_const_tuple( kernel.shape) else: oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple( kernel.shape) num_filter = oc_chunk * oc_bn if cfg.is_fallback: _get_default_config( cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype), tvm.placeholder( (num_filter, in_channel, kernel_height, kernel_width), dtype=kernel.dtype), strides, padding, out_dtype) # output shape out_height = (ih + 2 * HPAD - kernel_height) // HSTR + 1 out_width = (iw + 2 * WPAD - kernel_width) // WSTR + 1 oshape = (n, oc_chunk, out_height, out_width, oc_bn) # DOPAD DOPAD = (HPAD != 0 or WPAD != 0) if DOPAD: data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad") else: data_pad = data ic = tvm.reduce_axis((0, in_channel), name='ic') kh = tvm.reduce_axis((0, kernel_height), name='kh') kw = tvm.reduce_axis((0, kernel_width), name='kw') if data.dtype == 'uint8': assert out_dtype == "int32", \ "INT8 convolution requires input dtype = uint8 and output dtype=int32" # Intel performs dot product of 2 "4" Int8 values # Current implementation requires ic_bn to be a multiple of 4 n_elems = 4 assert ic_bn % n_elems == 0 ic_outer = tvm.reduce_axis((0, in_channel // ic_bn), name='ic_outer') ic_f_inner = tvm.reduce_axis((0, ic_bn // n_elems), name='ic_f_inner') ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner') return tvm.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum( data_pad[n, ic_outer, oh * HSTR + kh, ow * WSTR + kw, ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) * kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner].astype(out_dtype), axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]), name='conv2d_NCHWc_int8', tag="conv2d_NCHWc_int8") # else: fp implementation return tvm.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_pad[ n, ic // ic_bn, oh * HSTR + kh, ow * WSTR + kw, ic % ic_bn].astype( out_dtype) * kernel[oc_chunk, ic // ic_bn, kh, kw, ic % ic_bn, oc_block], axis=[ic, kh, kw]), name='conv2d_NCHWc', tag="conv2d_NCHWc")
import tvm import numpy as np ###################################################################### # Define Matrix Multiplication # ---------------------------- # Take matrix multiplication as our example. # Matmul first multiplies the corresponding elements of the two matrices, # then accumulates the products along a given axis. # The following lines describe the computation :code:`A * B^T` in TVM. # N, M, L = 1024, 512, 64 A = tvm.placeholder((N, L), name='A') B = tvm.placeholder((M, L), name='B') k = tvm.reduce_axis((0, L), name='k') C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[j, k], axis=k), name='C') s = tvm.create_schedule(C.op) print(tvm.lower(s, [A, B, C], simple_mode=True)) ###################################################################### # Schedule the Matmul # ------------------- # Now, suppose we have an accelerator that supports # matrix-vector multiplication (GEMV) as a hardware primitive, # which can handle a reduce axis of arbitrary size, # but the other axis must be no larger than 16. # Thus we break the matmul loops down so that the innermost loops form a (16x64) GEMV, as sketched below. #
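# A sketch of that split step, consistent with the schedule above: the j axis
# is split by the 16-element constraint so the two innermost loops form the
# (16, 64) GEMV that a tensor intrinsic could later replace.
factor = 16
x, y = C.op.axis
z, = C.op.reduce_axis
yo, yi = s[C].split(y, factor=factor)
s[C].reorder(x, yo, yi, z)
print(tvm.lower(s, [A, B, C], simple_mode=True))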
def test(): env = nnpu.get_env() nnpu.set_device(env) shape = (2, 2, 16) dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w'] a = tvm.placeholder(shape, dtype_w, 'a') sph = ScheduleProcHelper() a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph) k = tvm.reduce_axis((0, 2), 'k') add_buf = tvm.compute( (2, 16), lambda i, j: tvm.sum(a_buf[k, i, j], axis=k), 'add_buf') sph.MarkScope(add_buf) add_host, add_dram = nnpu.utils.CopyBufToH(add_buf, 'add', sph) k1 = tvm.reduce_axis((0, 2), 'k1') mul_buf = tvm.compute( (2, 16), lambda i, j: tvm.sum(a_buf[k1, i, j], axis=k1), 'mul_buf') sph.MarkScope(mul_buf) mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph) s = tvm.create_schedule([add_host.op, mul_host.op]) sph.Transform(s) ko, ki = s[add_buf].split(add_buf.op.reduce_axis[0], factor=1) s[add_buf].reorder(ko, ki, *(s[add_buf].op.axis)) s[add_buf].tensorize(ki, env.intrins.get('MAddMerge', shape=shape, mode='w')) ko1, ki1 = s[mul_buf].split(mul_buf.op.reduce_axis[0], factor=1) s[mul_buf].reorder(ko1, ki1, *(s[mul_buf].op.axis)) s[mul_buf].tensorize(ki1, env.intrins.get('MMulMerge', shape=shape, mode='w')) print(nnpu.lower(s, [a, add_host, mul_host], simple_mode=True)) func = nnpu.build(s, [a, add_host, mul_host], 'nnpu', 'llvm', name='nnpu_func') #exit() ctx = tvm.nd.TVMContext(13, 0) a_np = np.random.randint(size=(2, 2, 16), dtype=a.dtype, low=-16, high=16) a_nd = tvm.nd.array(a_np, ctx) add_nd = tvm.nd.array(np.zeros((2, 16)).astype(add_host.dtype), ctx) mul_nd = tvm.nd.array(np.zeros((2, 16)).astype(mul_host.dtype), ctx) func(a_nd, add_nd, mul_nd) print('a = ') print(a_np) print('reduce sum row = ') print(add_nd.asnumpy()) print('ground truth is: ') gt = np.sum(a_np, axis=0) print(gt) np.testing.assert_allclose(add_nd.asnumpy(), gt) print('reduce mul row = ') print(mul_nd.asnumpy()) gt = np.multiply.reduce(a_np, axis=0, dtype=a.dtype) print(gt) np.testing.assert_allclose(mul_nd.asnumpy(), gt)
# NOTE: the head of this snippet was truncated; the imports and timeit setup below are a reconstruction, and the matrix sizes and target are assumptions. import timeit import numpy as np import tvm M = K = N = 1024 dtype = 'float32' target = 'llvm' ctx = tvm.context(target, 0) np_repeat = 100 np_running_time = timeit.timeit(setup='import numpy as np\n' 'M = K = N = 1024\n' 'dtype = "float32"\n' 'a = np.random.rand(M, K).astype(dtype)\n' 'b = np.random.rand(K, N).astype(dtype)\n', stmt='answer = np.dot(a, b)', number=np_repeat) print("Numpy running time: %f" % (np_running_time / np_repeat)) # ground truth a = tvm.nd.array(np.random.rand(M, K).astype(dtype), ctx) b = tvm.nd.array(np.random.rand(K, N).astype(dtype), ctx) c = tvm.nd.array(np.zeros((M, N), dtype=dtype), ctx) answer = np.dot(a.asnumpy(), b.asnumpy()) ################### # TVM part # Algorithm k = tvm.reduce_axis((0, K), 'k') A = tvm.placeholder((M, K), name='A') B = tvm.placeholder((K, N), name='B') C = tvm.compute((M, N), lambda x, y: tvm.sum(A[x, k] * B[k, y], axis=k), name='C') # Default schedule s = tvm.create_schedule(C.op) func = tvm.build(s, [A, B, C], target=target, name='mmult') print(tvm.lower(s, [A, B, C], simple_mode=True)) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) evaluator = func.time_evaluator(func.entry_name, ctx, number=1)
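# A natural next step (sketch): time the unoptimized schedule so it can be
# compared against the numpy baseline printed above.
print('Baseline: %f' % evaluator(a, b, c).mean)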
def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96): A = tvm.placeholder((n, l), name='A', dtype='float16') B = tvm.placeholder((l, m), name='B', dtype='float16') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute((n, m), lambda i, j: tvm.sum( A[i, k].astype('float32') * B[k, j].astype('float32'), axis=k)) s = tvm.create_schedule(C.op) y, x = s[C].op.axis k = s[C].op.reduce_axis[0] AA = s.cache_read(A, "shared", [C]) AL = s.cache_read(AA, "local", [C]) BB = s.cache_read(B, "shared", [C]) BL = s.cache_read(BB, "local", [C]) CL = s.cache_write(C, "local") bx = 4 by = 32 step_k = 8 v = 4 TX = 8 TY = 1 tile_x = bx * TX tile_y = by * TY WX = min(warp_tile_m, tile_x) tile_k = 16 vthread = 1 yo, ty = s[C].split(y, tile_y * vthread) vy, ty = s[C].split(ty, tile_y) ty, yi = s[C].split(ty, TY) xo, xi = s[C].split(x, tile_x) tz, xi = s[C].split(xi, WX) tx, xi = s[C].split(xi, TX) ko, ki = s[CL].split(k, step_k * tile_k) kl, ki = s[CL].split(ki, tile_k) s[C].reorder(yo, xo, tz, ty, tx, yi, xi) s[C].bind(yo, tvm.thread_axis("blockIdx.y")) s[C].bind(xo, tvm.thread_axis("blockIdx.x")) s[C].bind(ty, tvm.thread_axis("threadIdx.y")) s[C].bind(tz, tvm.thread_axis("threadIdx.z")) s[C].bind(tx, tvm.thread_axis("threadIdx.x")) s[C].bind(vy, tvm.thread_axis((0, vthread), "vthread", name="vy")) s[CL].compute_at(s[C], tx) yo, xo = CL.op.axis s[CL].reorder(ko, kl, ki, yo, xo) s[AA].compute_at(s[CL], ko) xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx * v) tz, tx = s[AA].split(xi, factor=(WX // TX) * v) tx, vec = s[AA].split(tx, factor=v) fused = s[AA].fuse(s[AA].op.axis[0], xo) _, ty = s[AA].split(fused, factor=by) s[AA].bind(ty, tvm.thread_axis("threadIdx.y")) s[AA].bind(tz, tvm.thread_axis("threadIdx.z")) s[AA].bind(tx, tvm.thread_axis("threadIdx.x")) s[AA].vectorize(vec) s[BB].compute_at(s[CL], ko) xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx * v) tz, tx = s[BB].split(xi, factor=(WX // TX) * v) tx, vec = s[BB].split(tx, factor=v) fused = s[BB].fuse(s[BB].op.axis[0], xo) _, ty = s[BB].split(fused, factor=by) s[BB].bind(ty, tvm.thread_axis("threadIdx.y")) s[BB].bind(tz, tvm.thread_axis("threadIdx.z")) s[BB].bind(tx, tvm.thread_axis("threadIdx.x")) s[BB].vectorize(vec) s[AL].compute_at(s[CL], kl) s[BL].compute_at(s[CL], kl) s[CL].pragma(ko, 'tensor_core') func = tvm.build(s, [A, B, C], 'cuda') ctx = tvm.gpu(0) a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(l, m)).astype(B.dtype) c_np = np.zeros((n, m), dtype=np.float32) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) func(a, b, c) evaluator = func.time_evaluator(func.entry_name, ctx, number=3) print('gemm m=%d n=%d k=%d: %f ms' % (m, n, l, evaluator(a, b, c).mean * 1e3)) c_np = np.dot(a_np, b_np) np.testing.assert_allclose(c_np, c.asnumpy(), rtol=1e-3)
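# Invocation sketch: the schedule above requires a Tensor Core capable GPU
# (Volta or newer) and a CUDA-enabled TVM build; the warp tile values tried
# here correspond to the wmma-supported m dimensions (an assumption).
for warp_tile in [16, 8, 32]:
    tensor_core_matmul(warp_tile_m=warp_tile)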
def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile): out_dtype = out_dtype or data.dtype N, C, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation if len(kernel.shape) == 4: pre_packed = False C, M, KH, KW = get_const_tuple(kernel.shape) else: # kernel tensor is pre packed pre_packed = True C, M, KH, KW, VC = get_const_tuple(kernel.shape) C = C * VC dilated_kernel_h = (KH - 1) * dilation_h + 1 dilated_kernel_w = (KW - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1 OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1 # pack data HPAD = pad_top + pad_down WPAD = pad_left + pad_right DOPAD = (HPAD != 0 or WPAD != 0) if DOPAD: data_pad = pad(data, (0, 0, pad_top, pad_left), (0, 0, pad_down, pad_right), name="data_pad") else: data_pad = data # fallback support # Currently, Mali schedule doesn't use it like conv2d. if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log('arm_cpu', 'rk3399', 'depthwise_conv2d_nchw', 'contrib_spatial_pack') cfg.fallback_with_reference_log(ref_log) # ==================== define configuration space ==================== n, c, oh, ow = cfg.axis(N), cfg.axis(C), cfg.axis(OH), cfg.axis(OW) kh, kw = cfg.reduce_axis(KH), cfg.reduce_axis(KW) # Currently, Mali schedule doesn't use it like conv2d. # Leave num_tile for possible future use of Mali schedule if num_tile == 2: # for arm cpu co, vc = cfg.define_split('tile_co', c, num_outputs=2) oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2) ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2) else: raise RuntimeError("Invalid num_tile") cfg.define_reorder("reorder_0", [n, co, oh, ow, kh, kw, vh, vw, vc], policy='candidate', candidate=[[n, co, oh, ow, kh, kw, vh, vw, vc], [n, co, oh, ow, kh, kw, vc, vh, vw]]) cfg.define_reorder("reorder_1", [n, co, oh, ow, vh, vw, vc], policy='candidate', candidate=[[n, co, oh, ow, vh, vw, vc], [n, co, oh, ow, vc, vh, vw], [n, co, oh, ow, vh, vc, vw]]) cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll') cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec') # ==================================================================== VC = cfg["tile_co"].size[-1] VH = cfg["tile_oh"].size[-1] VW = cfg["tile_ow"].size[-1] kvshape = (C // VC, M, KH, KW, VC) ovshape = (N, C * M // VC, OH // VH, OW // VW, VH, VW, VC) oshape = (N, C * M, OH, OW) if dilation_h != 1 or dilation_w != 1: # undilate input data dvshape = (N, OH // VH, OW // VW, C, KH, KW, VH, VW) data_vec = tvm.compute( dvshape, lambda n, h, w, c, kh, kw, vh, vw: data_pad[n][c][ (h * VH + vh) * HSTR + kh * dilation_h][ (w * VW + vw) * WSTR + kw * dilation_w], name='data_vec_undilated') else: dvshape = (N, OH // VH, OW // VW, C, VH * HSTR + KH - 1, VW * WSTR + KW - 1) data_vec = tvm.compute(dvshape, lambda n, h, w, c, vh, vw: data_pad[n][c][ h * VH * HSTR + vh][w * VW * WSTR + vw], name='data_vec') if pre_packed: kernel_vec = kernel else: kernel_vec = tvm.compute( kvshape, lambda co, m, kh, kw, vc: kernel[co * VC + vc][m][kh][kw], name='kernel_vec') kh = tvm.reduce_axis((0, KH), name='kh') kw = tvm.reduce_axis((0, KW), name='kw') if dilation_h != 1 or dilation_w != 1: conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ 
tvm.sum(data_vec[n, h, w, (co * VC + vc) // M, kh, kw, vh, vw] .astype(out_dtype) * kernel_vec[co // M, co % M, kh, kw, vc].astype(out_dtype), axis=[kh, kw]), name='depthwise_conv') else: conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ tvm.sum(data_vec[n, h, w, (co * VC + vc) // M, vh * HSTR + kh, vw * WSTR + kw].astype(out_dtype) * kernel_vec[co // M, co % M, kh, kw, vc].astype(out_dtype), axis=[kh, kw]), name='depthwise_conv') output = tvm.compute(oshape, lambda n, co, h, w: conv[n][co // VC][h // VH][ w // VW][h % VH][w % VW][co % VC], name='output_unpack', tag='spatial_depthwise_conv_nchw_output') return output
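# Index round-trip sketch for the unpack step above: the flat output element
# (co, h, w) lives in packed block (co // VC, h // VH, w // VW) at offset
# (h % VH, w % VW, co % VC). With illustrative factors:
VC, VH, VW = 4, 2, 2
co, h, w = 10, 5, 7
block = (co // VC, h // VH, w // VW)   # (2, 2, 3)
offset = (h % VH, w % VW, co % VC)     # (1, 1, 2)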
def measure_bandwidth_sum(total_item, item_per_thread, stride, base_type, bits, lanes, target, target_host, remote, ctx, n_times): """ measure memory bandwidth of gpu by product reduction for a given type The IR for measurement is for each thread for i in 1..num_per_thread: y[global_id] = y[global_id] * x[base + i * stride] Parameters ---------- total_item: int number of elements in input array item_per_thread: int number of elements each thread accumulates stride: int stride in memory access base_type: str can be "int", "float" bits: int can be 16, 32 lanes: int lane of the vector type, can be 1, 2, 4, 8, 16 target: :any:`tvm.target.Target` the target and option of the compilation. target_host : str or :any:`tvm.target.Target` host compilation target ctx: TVMcontext the context of array remote: tvm.rpc.RPCSession remote rpc session n_times: int number of runs for taking mean Returns ------- GBPS: float gigabyte per second """ n, m = total_item, item_per_thread n //= lanes base_type = str(base_type) + str(bits) dtype = base_type if lanes == 1 else base_type + "x" + str(lanes) k = tvm.reduce_axis((0, m), name="k") x = tvm.placeholder((n, ), dtype=dtype, name="x") op = tvm.comm_reducer(lambda x, y: x * y, lambda t: tvm.const(1, dtype=t), name="sum") y = tvm.compute((n // m, ), lambda i: op( x[i // stride * stride * m + i % stride + k * stride], axis=k)) s = tvm.create_schedule(y.op) yo, yi = s[y].split(y.op.axis[0], target.max_num_threads) s[y].bind(yo, tvm.thread_axis("blockIdx.x")) s[y].bind(yi, tvm.thread_axis("threadIdx.x")) s[y].unroll(k) try: func = tvm.build(s, [x, y], target, target_host=target_host) x = tvm.nd.empty((n, ), dtype=dtype, ctx=ctx) y = tvm.nd.empty((n // m, ), dtype=dtype, ctx=ctx) func = _convert_to_remote(func, remote) time_f = func.time_evaluator(func.entry_name, ctx, number=n_times) time = time_f(x, y).mean except tvm._ffi.base.TVMError: # build error (occur when device does not support half) return -1 return 1.0 * (total_item * bits / 8) / 1e9 / time
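# Worked example of the GBPS formula above (hypothetical numbers): 2**25
# float32 elements read once in 1 ms move 2**25 * 32 / 8 bytes, about 134 GB/s.
total_item, bits, time = 2 ** 25, 32, 1e-3
gbps = 1.0 * (total_item * bits / 8) / 1e9 / time  # ~134.2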
def _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, out_dtype): """ Compute convolution with pack on spatial axes. """ assert data.shape[ 0].value == 1, "spatial pack convolution only support batch size=1" wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NHWC") sch = _get_schedule(wkl, "NHWC") VH = sch.vh VW = sch.vw VC = sch.vc data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type='uint8') kernel_vec = _kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC) N, H, W, IB, CI = data_q.shape OCO, KH, KW, KB, VC, _ = kernel_vec.shape CO = OCO * VC HPAD, WPAD, _, _ = get_pad_tuple(padding, (KH, KW)) if isinstance(stride, (tuple, list)): HSTR, WSTR = stride else: HSTR, WSTR = stride, stride HCAT, WCAT = KH - 1, KW - 1 PAD_H = H + 2 * HPAD PAD_W = W + 2 * WPAD OH = (H + 2 * HPAD - KH) // HSTR + 1 OW = (W + 2 * WPAD - KW) // WSTR + 1 dvshape = (N, PAD_H // (VH * HSTR), PAD_W // (VW * WSTR), VH * HSTR + HCAT, VW * WSTR + WCAT, IB, CI) ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC) oshape = (1, OH, OW, CO) if HPAD != 0 or WPAD != 0: data_pad = pad(data_q, (0, HPAD, WPAD, 0, 0), name="data_pad") else: data_pad = data_q data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, b, ci: \ data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][b][ci], name='data_vec') ci = tvm.reduce_axis((0, CI), name='ci') dh = tvm.reduce_axis((0, KH), name='dh') dw = tvm.reduce_axis((0, KW), name='dw') ib = tvm.reduce_axis((0, IB), name='ib') kb = tvm.reduce_axis((0, KB), name='kb') def _conv(n, h, w, co, vh, vw, vc): return tvm.sum( (tvm.popcount(kernel_vec[co, dh, dw, kb, vc, ci].astype('uint16') & data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ib, ci].astype('uint16')) << (kb + ib).astype('uint16')), axis=[dh, dw, kb, ib, ci]) conv = tvm.compute(ovshape, _conv, name='conv') return tvm.compute(oshape, lambda n, h, w, co: conv[n][h // VH][w // VW][co // VC][ h % VH][w % VW][co % VC].astype(out_dtype), name='output_vec', tag='spatial_bitserial_conv_nhwc')
def comm_reduce(data, axis=None, keepdims=False, func=tvm.sum, is_idx_reduce=False): """Reducing the data Parameters ---------- data : tvm.Tensor The input data axis : None or int or tuple of int Axis or axes along which a sum is performed. The default, axis=None, will sum all of the elements of the input array. If axis is negative it counts from the last to the first axis. keepdims : bool If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With this option, the result will broadcast correctly against the input array. func : function functions like tvm.sum, tvm.max, tvm.min Returns ------- ret : tvm.Tensor """ ndim = len(data.shape) assert ndim != 0, "Reduce a dim-0 input is not supported!" real_axis = _get_real_axis(ndim, axis) reduce_axes = [ tvm.reduce_axis((0, data.shape[i]), "k%d" % i) for i in real_axis ] if keepdims: target_shape = [ 1 if i in real_axis else data.shape[i] for i in range(ndim) ] else: target_shape = [] for i in range(ndim): if i not in real_axis: target_shape.append(tvm.convert(data.shape[i])) def _compute(*indices): eval_range = [] eval_indices = [] if not keepdims: arg_counter = 0 else: arg_counter = None red_counter = 0 for i in range(len(data.shape)): if i in real_axis: eval_range.append(reduce_axes[red_counter]) eval_indices.append(reduce_axes[red_counter].var) red_counter += 1 else: if not keepdims: eval_range.append(indices[arg_counter]) arg_counter += 1 else: eval_range.append(indices[i]) if not is_idx_reduce: return func(data[tuple(eval_range)], axis=reduce_axes) idx = ravel_index(eval_indices, [data.shape[i] for i in real_axis]) return func((idx, data[tuple(eval_range)]), axis=reduce_axes) if is_idx_reduce: temp_idx, temp_val = tvm.compute(target_shape, _compute, name=data.name + "_red_temp") out = tvm.compute( target_shape, lambda *indices: _choose_idx(temp_idx, temp_val, *indices), name=data.name + "_red") else: out = tvm.compute(target_shape, _compute, name=data.name + "_red") return out
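# Usage sketch for comm_reduce (assumes the surrounding topi helpers
# _get_real_axis, ravel_index and _choose_idx are in scope): a keepdims sum
# over axis 1 of a 2-D tensor.
data = tvm.placeholder((4, 8), name='data')
red = comm_reduce(data, axis=1, keepdims=True, func=tvm.sum)
# red.shape == (4, 1); with keepdims=False it would be (4,)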
def _decl_direct(data, kernel, stride, padding, layout, out_dtype): """declare the direct method (spatial packing) for conv2d""" _, CI, IH, IW = [util.get_const_int(x) for x in data.shape] CO, _, KH, KW = [util.get_const_int(x) for x in kernel.shape] HPAD, WPAD, _, _ = get_pad_tuple(padding, (KH, KW)) HCAT, WCAT = KH - 1, KW - 1 if isinstance(stride, (tuple, list)): HSTR, WSTR = stride else: HSTR, WSTR = stride, stride N = 1 TH = IH + 2 * HPAD TW = IW + 2 * WPAD OH = (IH + 2 * HPAD - KH) // HSTR + 1 OW = (IW + 2 * WPAD - KW) // WSTR + 1 DO_PAD = (HPAD != 0 or WPAD != 0) if DO_PAD: data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") else: data_pad = data # set tunable parameters (tile factor, ...) tune_config = getattr(tvm.target.current_target(), "tune_config", None) if tune_config is None: VH = 1 VW, VC = 4, 4 # correct tile factor if OW % VW != 0: if OW == 14: VW = 2 VC = 8 elif OW == 7: VW = 7 else: VH = tune_config['VH'] VW = tune_config['VW'] VC = tune_config['VC'] if data.dtype == 'float16': VC *= 2 assert CO % VC == 0 assert OH % VH == 0, "OH: %d VH : %d" % (OH, VH) assert OW % VW == 0, "OW: %d VW : %d" % (OW, VW) dvshape = (N, TH // (VH * HSTR), TW // (VW * WSTR), CI, VH * HSTR + HCAT, VW * WSTR + WCAT) kvshape = (CO // VC, CI, KH, KW, VC) ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC) oshape = (N, CO, OH, OW) data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: data_pad[n][ci][ h * VH * HSTR + vh][w * VW * WSTR + vw], name='data_vec') kernel_vec = tvm.compute( kvshape, lambda co, ci, kh, kw, vc: kernel[co * VC + vc][ci][kh][kw], name='kernel_vec') ci = tvm.reduce_axis((0, CI), name='ci') kh = tvm.reduce_axis((0, KH), name='kh') kw = tvm.reduce_axis((0, KW), name='kw') conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc:\ tvm.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) * kernel_vec[co, ci, kh, kw, vc].astype(out_dtype), axis=[ci, kh, kw]), name='conv') output = tvm.compute(oshape, lambda n, co, h, w: conv[n][co // VC][h // VH][w // VW][h % VH][w % VW][co % VC], name='output_unpack', tag='direct_conv_output') return output
def matmul(): # Algorithm k = tvm.reduce_axis((0, K), 'k') """Create a new IterVar for reduction. Parameters ---------- dom : Range The domain of iteration. name : str The name of the variable. Returns ------- axis : IterVar An iteration variable representing the value. """ A = tvm.placeholder((M, K), name='A') B = tvm.placeholder((K, N), name='B') ##### define space begin ##### cfg = autotvm.get_config() """Get current config object Returns ------- cfg: ConfigSpace or ConfigEntity The current config """ cfg.define_split("tile_x", M, num_outputs=2) cfg.define_split("tile_y", N, num_outputs=2) cfg.define_split("tile_k", K, num_outputs=2) """Define a new tunable knob which splits an axis into a list of axes Parameters ---------- name: str name to index the entity of this space axis: tvm.schedule.IterVar axis to split policy: str name of policy. If is 'factors', the tuner will try all divisible factors. If is 'power2', the tuner will try power-of-two factors less or equal to the length. If is 'verbose', the tuner will try all candidates in above two policies. If is 'candidate', try given candidates. kwargs: dict extra arguments for policy max_factor: int the maximum split factor. filter: function(int) -> bool see examples below for how to use filter. num_outputs: int the total number of axis after split. no_tail: bool should we only include divisible numbers as split factors. candidate: list (policy=candidate) manual candidate list. Examples -------- >>> # use custom candidates >>> cfg.define_split('tile_x', x, policy='candidate', candidate=[[1, 4, 4], [4, 1, 4]]) >>> # use a filter that only accepts the split scheme whose inner most tile is less then 4 >>> cfg.define_split('tile_y', y, policy='factors', filter=lambda x: x.size[-1] <= 4) """ ##### define space end ##### # We have to re-write the algorithm slightly. #print("cfg[tile_y]", cfg["tile_y"]) # prints the candidate space of tile_y, e.g. [-1, 128] xn = cfg["tile_x"].size[-1] bn = cfg["tile_y"].size[-1] # take only the last element of the split sizes, e.g. the 128 above kn = cfg["tile_k"].size[-1] #print("xn:", xn, "bn:", bn, "kn:", kn) packedB = tvm.compute((N // bn, K, bn), lambda x, y, z: B[y, x * bn + z], name='packedB') """Construct a new tensor by computing over the shape domain. The compute rule is result[axis] = fcompute(axis) Parameters ---------- shape: Tuple of Expr The shape of the tensor fcompute: lambda function of indices-> value Specifies the input source expression name: str, optional The name hint of the tensor tag: str, optional Additional tag information about the compute. attrs: dict, optional The additional auxiliary attributes about the compute. Returns ------- tensor: Tensor The created tensor """ # "//" is integer (floor) division: it returns the largest integer not greater than the quotient C = tvm.compute( (M, N), lambda x, y: tvm.sum(A[x, k] * packedB[y // bn, k, y % bn], axis=k), name='C') s = tvm.create_schedule(C.op) """Create a schedule for list of ops Parameters ---------- ops : list of Operations The source expression. Returns ------- sch : schedule.Schedule The created schedule. """ x, y = s[C].op.axis k, = s[C].op.reduce_axis #print("x:", (x)) # x: iter_var(x, range(min=0, ext=1024)) # schedule according to config # Allocate write cache CC = s.cache_write(C, 'global') """Create a cache write of the original tensor before it is stored. This mutates the body of the tensor; a new cache stage is created before the store. This function can be used to support data layout transformation: if there is a split/fuse/reorder on the data-parallel axes of the tensor before cache_write is called, the intermediate cache stores the data in the iteration order of the leaf axes, and the data is transformed back to the original layout in the original tensor. Users can further call compute_inline to inline the original layout and keep the data stored in the transformed layout. Parameters ---------- tensor : Tensor, list or tuple The tensors to be fed to. All the tensors must be produced by one computeOp scope : str The scope of cached Returns ------- cache : Tensor The created cache tensor. """ xo, xi = cfg["tile_x"].apply(s, C, x) yo, yi = cfg["tile_y"].apply(s, C, y) s[C].reorder(xo, yo, xi, yi) # Write cache is computed at yo s[CC].compute_at(s[C], yo) """Attach the stage at parent's scope Parameters ---------- parent : Stage The parent stage scope : IterVar The loop scope to be attached to. """ # New inner axes xc, yc = s[CC].op.axis k, = s[CC].op.reduce_axis ko, ki = cfg["tile_k"].apply(s, CC, k) s[CC].reorder(ko, xc, ki, yc) s[CC].unroll(ki) """Unroll the iteration. Parameters ---------- var : IterVar The iteration to be unrolled. """ s[CC].vectorize(yc) """Vectorize the iteration. Parameters ---------- var : IterVar The iteration to be vectorized. """ # parallel s[C].parallel(xo) """Parallelize the iteration. Parameters ---------- var : IterVar The iteration to be parallelized. """ x, y, z = s[packedB].op.axis s[packedB].vectorize(z) s[packedB].parallel(x) return s, [A, B, C]
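# Tuning sketch for the template above (old-style autotvm API to match this
# code's era; the tuner choice, trial count and log file name are assumptions,
# and M, N, K must be defined globally before matmul() is traced):
matmul_tmpl = autotvm.template(matmul)
task = autotvm.task.create(matmul_tmpl, args=(), target='llvm')
measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(),
                                        runner=autotvm.LocalRunner(number=5))
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=20, measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('matmul.log')])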
def test_gemm(): # graph nn = 1024 n = tvm.var('n') n = tvm.convert(nn) m = n l = n A = tvm.placeholder((n, l), name='A') B = tvm.placeholder((m, l), name='B') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute( (n, m), lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k), name='CC') # schedule s = tvm.create_schedule(C.op) xtile, ytile = 32, 32 scale = 8 num_thread = 8 block_factor = scale * num_thread block_x = tvm.thread_axis("blockIdx.x") thread_x = tvm.thread_axis("threadIdx.x") block_y = tvm.thread_axis("blockIdx.y") thread_y = tvm.thread_axis("threadIdx.y") CC = s.cache_write(C, "local") AA = s.cache_read(A, "shared", [CC]) BB = s.cache_read(B, "shared", [CC]) by, yi = s[C].split(C.op.axis[0], factor=block_factor) bx, xi = s[C].split(C.op.axis[1], factor=block_factor) s[C].reorder(by, bx, yi, xi) s[C].bind(by, block_y) s[C].bind(bx, block_x) ty, yi = s[C].split(yi, nparts=num_thread) tx, xi = s[C].split(xi, nparts=num_thread) s[C].reorder(ty, tx, yi, xi) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) yo, xo = CC.op.axis s[CC].reorder(k, yo, xo) s[CC].compute_at(s[C], tx) s[AA].compute_at(s[CC], k) s[BB].compute_at(s[CC], k) s[AA].double_buffer() s[BB].double_buffer() ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread) tx, xi = s[AA].split(xi, nparts=num_thread) s[AA].bind(ty, thread_y) s[AA].bind(tx, thread_x) ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread) tx, xi = s[BB].split(xi, nparts=num_thread) s[BB].bind(ty, thread_y) s[BB].bind(tx, thread_x) # lowering test s = s.normalize() # one line to build the function. def check_device(device): if not tvm.module.enabled(device): print("skip because %s is not enabled.." % device) return f = tvm.build(s, [A, B, C], device) ctx = tvm.context(device, 0) # launch the kernel. n = nn m = n l = n a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(m, l)).astype(B.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) ftimer = f.time_evaluator(f.entry_name, ctx, number=1) tcost = ftimer(a, b, c).mean print("%s: exec=%g sec/op" % (ctx, tcost)) np.testing.assert_allclose( c.asnumpy(), np.dot(a_np, b_np.T), rtol=1e-5) check_device("nvptx -mcpu=sm_20") check_device("rocm") check_device("metal") check_device("opencl") check_device("cuda")
def _intrin_popcount(m, k_i, w_b, x_b): dtype = 'uint8' w = tvm.placeholder((w_b, m, k_i), dtype=dtype, name='w') x = tvm.placeholder(( x_b, k_i, ), dtype=dtype, name='x') k = tvm.reduce_axis((0, k_i), name='k') bw = tvm.reduce_axis((0, w_b), name='bw') bx = tvm.reduce_axis((0, x_b), name='bx') z = tvm.compute((m, ), lambda i: tvm.sum(tvm.popcount(w[bw, i, k].astype( 'uint16') & x[bx, k].astype('uint16')) << (bw + bx).astype('uint16'), axis=[bw, bx, k]), name='z') Wb = tvm.decl_buffer(w.shape, w.dtype, name="W", offset_factor=k_i, strides=[tvm.var('ldw'), tvm.var('ldw'), 1]) Xb = tvm.decl_buffer(x.shape, x.dtype, name="X", offset_factor=k_i, strides=[tvm.var('ldw'), 1]) def _intrin_func(ins, outs): ww, xx = ins zz = outs[0] vpadd = "llvm.arm.neon.vpadd.v8u8" vpadalu = "llvm.arm.neon.vpadalu.v16u8.v8u16" args_1 = tvm.const(1, 'uint32') args_2 = tvm.const(2, 'uint32') def _instr(index): irb = tvm.ir_builder.create() if index == 1: irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8'))) return irb.get() cnts8 = [None] * 8 cnts4 = [None] * 4 cnts2 = [None] * 2 for bw in range(w_b): for bx in range(x_b): if k_i == 16: for i in range(m): ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload( [bx, 0], 'uint8x16') cnts = tvm.popcount(ands) upper_half = tvm.call_pure_intrin( 'uint8x8', 'vectorhigh', cnts) lower_half = tvm.call_pure_intrin( 'uint8x8', 'vectorlow', cnts) cnts8[i] = upper_half + lower_half for i in range(m // 2): cnts4[i] = tvm.call_llvm_intrin( 'uint8x8', vpadd, args_1, cnts8[i * 2], cnts8[i * 2 + 1]) for i in range(m // 4): cnts2[i] = tvm.call_llvm_intrin( 'uint8x8', vpadd, args_1, cnts4[i * 2], cnts4[i * 2 + 1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw + bx, dtype) out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) else: # ki == 8 for i in range(m): ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload( [bx, 0], 'uint8x8') cnts8[i] = tvm.popcount(ands) for i in range(m // 2): cnts4[i] = tvm.call_llvm_intrin( 'uint8x8', vpadd, args_1, cnts8[i * 2], cnts8[i * 2 + 1]) for i in range(m // 4): cnts2[i] = tvm.call_llvm_intrin( 'uint8x8', vpadd, args_1, cnts4[i * 2], cnts4[i * 2 + 1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw + bx, dtype) out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) irb.emit(zz.vstore(0, out)) return irb.get() # body, reset, update return _instr(0), _instr(1), _instr(2) with tvm.build_config(offset_factor=1, partition_const_loop=True): return tvm.decl_tensor_intrin(z.op, _intrin_func, binds={w: Wb, x: Xb})
def bitserial_dense(data, weight, data_bits, weight_bits, pack_dtype='uint32', out_dtype='int16', unipolar=True): """The default implementation of bitserial dense in topi. Parameters ---------- data : tvm.Tensor 2-D with shape [batch, in_dim] weight : tvm.Tensor 2-D with shape [out_dim, in_dim] or 3-D with shape [out_dim, weight_bits, in_dim] Returns ------- output : tvm.Tensor 2-D with shape [batch, out_dim] """ data_packed = bitpack(data, data_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) if len(weight.shape) == 2: weight_packed = bitpack(weight, weight_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) else: weight_packed = weight Y, DB, K = get_const_tuple(data_packed.shape) X, WB, _ = get_const_tuple(weight_packed.shape) oshape = (Y, X) k = tvm.reduce_axis((0, K), name='k') db = tvm.reduce_axis((0, DB), name='db') wb = tvm.reduce_axis((0, WB), name='wb') matmul_unipolar = tvm.compute( oshape, lambda i, j: tvm.sum((tvm.popcount(weight_packed[ j, wb, k] & data_packed[i, db, k]) - tvm.popcount(~weight_packed[ j, wb, k] & data_packed[i, db, k])).astype(out_dtype) << (db + wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense_unipolar') matmul = tvm.compute( oshape, lambda i, j: tvm.sum(tvm.popcount(weight_packed[ j, wb, k] & data_packed[i, db, k]).astype(out_dtype) << (db + wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense') if unipolar: return matmul_unipolar return matmul
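# Usage sketch for bitserial_dense (dtypes/shapes are illustrative): 2-bit
# activations against 1-bit weights; bitpack inside the operator creates the
# bit-plane axes DB and WB and packs in_dim into uint32 words.
data = tvm.placeholder((16, 1024), dtype='uint8', name='data')
weight = tvm.placeholder((256, 1024), dtype='uint8', name='weight')
out = bitserial_dense(data, weight, data_bits=2, weight_bits=1)
# out.shape == (16, 256)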
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation if len(kernel.shape) == 4: if dilation_h != 1 or dilation_w != 1: kernel = dilate(kernel, (1, 1, dilation_h, dilation_w)) pre_computed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation" pre_computed = True H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape) CO *= VC KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1 HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) assert layout == 'NCHW' assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") r = KW m = tile_size alpha = m + r - 1 A, B, G = winograd_transform_matrices(m, r, out_dtype) H = (IH + 2 * HPAD - 3) // HSTR + 1 W = (IW + 2 * WPAD - 3) // WSTR + 1 nH, nW = (H + m - 1) // m, (W + m - 1) // m P = N * nH * nW ##### space definition begin ##### tile_bna_candidates = [1, 2, 4, 8, 16] factors = get_factors(CO) cfg.define_knob('tile_bna', [x for x in tile_bna_candidates if x in factors]) cfg.define_knob('tile_bnb', [1, 2, 4, 8, 16]) cfg.define_split('tile_t1', CI, num_outputs=2, max_factor=128) cfg.define_split('tile_t2', CO, num_outputs=2, max_factor=128) cfg.define_split('c_unroll', CI, num_outputs=2, max_factor=8) cfg.define_knob('yt', [1, 2, 4, 8, 16, 32]) ##### space definition end ##### if cfg.is_fallback: cfg['tile_bnb'].val = 4 cfg['tile_bna'].val = 4 while CO % cfg['tile_bna'].val != 0: cfg['tile_bna'].val //= 2 cfg['yt'].val = 8 cfg.fallback_split('tile_t1', [-1, 128]) cfg.fallback_split('tile_t2', [-1, 128]) cfg.fallback_split('c_unroll', [-1, 8]) bna = cfg['tile_bna'].val bnb = cfg['tile_bnb'].val P_round = (P + bnb - 1) // bnb * bnb assert CO % bna == 0 and P_round % bnb == 0 # pack input tile input_tile = tvm.compute((CI, P_round // bnb, alpha, alpha, bnb), lambda ci, b, eps, nu, bb: \ tvm.if_then_else( b * bnb + bb < P, data_pad[(b*bnb+bb) // (nH*nW)][ci][(b*bnb+bb) // nW % nH * m + eps] [(b*bnb+bb) % nW * m + nu], tvm.const(0, data_pad.dtype)), name='d') # transform kernel if pre_computed: U = kernel else: r_kh = tvm.reduce_axis((0, KH), 'r_kh') r_kw = tvm.reduce_axis((0, KW), 'r_kw') U = tvm.compute( (alpha, alpha, CO // bna, CI, bna), lambda eps, nu, co, ci, vco: tvm.sum(kernel[co * bna + vco][ci][ r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U') # transform image r_a = tvm.reduce_axis((0, alpha), 'r_a') r_b = tvm.reduce_axis((0, alpha), 'r_b') V = tvm.compute((alpha, alpha, P_round // bnb, CI, bnb), lambda eps, nu, p, ci, vp: tvm.sum(input_tile[ci][p][r_a][ r_b][vp] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]), name='V') # batch gemm ci = tvm.reduce_axis((0, CI), name='c') M = tvm.compute((alpha, alpha, CO, P_round), lambda eps, nu, co, p: tvm.sum(U[eps][nu][co // bna][ci][ co % bna] * V[eps][nu][p // bnb][ci][p % bnb], axis=ci), name='M') r_a = tvm.reduce_axis((0, alpha), 'r_a') r_b = tvm.reduce_axis((0, alpha), 'r_b') Y = tvm.compute( (CO, P, m, m), lambda co, p, vh, vw: tvm.sum( M[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]), name='Y') # unpack output output = tvm.compute( (N, CO, H, W), lambda n, co, h, w: Y[co][n * nH * nW + (h // m) * nW + w // m][h % m][w % m] # The following hack term is used to make the 
padding in batch gemm ("M") effective, otherwise the padding will be eliminated by bound inference. # Use `tvm.expr.Mul` instead of `*` to avoid issues in const folding. + tvm.expr.Mul(tvm.const(0, out_dtype), M[alpha - 1][alpha - 1][CO - 1][P_round - 1]), name='output', tag='winograd_conv2d_output') # we have to manually assign effective GFLOP for winograd cfg.add_flop(2 * N * CO * H * W * KH * KW * CI) return output
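# Arithmetic check for the F(m, r) Winograd transform above: producing an
# m x m output tile directly costs m*m*r*r multiplies, but only alpha*alpha
# element-wise multiplies in the transformed domain. For tile_size m = 2 and
# a 3x3 kernel (r = 3):
m, r = 2, 3
alpha = m + r - 1            # 4
direct_muls = m * m * r * r  # 36
winograd_muls = alpha * alpha  # 16, a 2.25x reduction per tile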
def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float16'): batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape] pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w)) if isinstance(stride, (tuple, list)): stride_h, stride_w = stride else: stride_h, stride_w = stride, stride out_channel = num_filter out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) oshape = (batch, out_channel, out_height, out_width) pad_before = [0, 0, pad_top, pad_left] pad_after = [0, 0, pad_down, pad_right] temp = pad(data, pad_before, pad_after, name="pad_temp") rc = tvm.reduce_axis((0, in_channel), name='rc') ry = tvm.reduce_axis((0, kernel_h), name='ry') rx = tvm.reduce_axis((0, kernel_w), name='rx') block_w = 0 block_h = 0 if stride_h == 2: if num_filter + kernel_h == 515: conv_tag = "4_4" block_h = 4 block_w = 4 else: conv_tag = "4_5" block_h = 4 block_w = 5 elif kernel_h == 3: if num_filter == 512: conv_tag = "2_7" block_h = 2 block_w = 7 else: conv_tag = "2_14" block_h = 2 block_w = 14 else: conv_tag = "1_16" block_h = 1 block_w = 16 c_h = out_height c_w = out_width if not out_height % block_h == 0: c_h = (out_height // block_h + 1) * block_h if not out_width % block_w == 0: c_w = (out_width // block_w + 1) * block_w nv = 16 cshape = (batch, out_channel // nv, c_h, c_w, nv) kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv) kernel_vec = tvm.compute( kvshape, lambda co, ci, kh, kw, vc: kernel[co*nv + vc][ci][kh][kw], name='kernel_vec') conv = tvm.compute( cshape, lambda nn, ff, yy, xx, vc:\ tvm.sum( temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype), axis=[rc, ry, rx]), tag=conv_tag, name='conv') output = tvm.compute( oshape, lambda nn, ff, yy, xx: conv[nn][ff//nv][yy][xx][ff%nv], name='output_unpack', tag=conv_tag) return output
def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): """Convolution operator in NHWC layout. Parameters ---------- Input : tvm.Tensor 4-D with shape [batch, in_height, in_width, in_channel] Filter : tvm.Tensor 4-D with shape [filter_height, filter_width, in_channel, num_filter] stride : int or a list/tuple of two ints Stride size, or [stride_height, stride_width] padding : int or str Padding size, or ['VALID', 'SAME'] dilation: int or a list/tuple of two ints dilation size, or [dilation_height, dilation_width] Returns ------- output : tvm.Tensor 4-D with shape [batch, out_height, out_width, out_channel] """ assert isinstance(stride, int) or len(stride) == 2 assert isinstance(dilation, int) or len(dilation) == 2 if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation batch, in_height, in_width, in_channel = Input.shape kernel_h, kernel_w, channel, num_filter = Filter.shape # compute the output shape dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) out_channel = num_filter out_height = simplify( (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) out_width = simplify( (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) pad_before = [0, pad_top, pad_left, 0] pad_after = [0, pad_down, pad_right, 0] PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput") rc = tvm.reduce_axis((0, in_channel), name='rc') ry = tvm.reduce_axis((0, kernel_h), name='ry') rx = tvm.reduce_axis((0, kernel_w), name='rx') Output = tvm.compute((batch, out_height, out_width, out_channel), lambda nn, yy, xx, ff: tvm.sum(PaddedInput[ nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc].astype(out_dtype) * Filter[ ry, rx, rc, ff].astype(out_dtype), axis=[ry, rx, rc]), name="Conv2dOutput", tag="conv2d_nhwc") return Output
def conv2d_transpose_nchw_cuda(cfg, Input, Filter, strides, padding, out_dtype): """Transposed 2D convolution nchw forward operator. Parameters ---------- cfg: ConfigEntity The config for this template Input : tvm.Tensor 4-D with shape [batch, in_channel, in_height, in_width] Filter : tvm.Tensor 4-D with shape [in_channel, num_filter, filter_height, filter_width] strides : tuple of two ints The spatial stride along height and width padding : int or str Padding size, or ['VALID', 'SAME'] out_dtype: str The output type. This is used in mixed precision Returns ------- Output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ batch, in_c, in_h, in_w = get_const_tuple(Input.shape) _, out_c, filter_h, filter_w = get_const_tuple(Filter.shape) stride_h, stride_w = strides # attach stride info to config, this is used in schedule space definition cfg.stride = strides # padding stage fpad_top, fpad_left, fpad_bottom, fpad_right = nn.get_pad_tuple( padding, (filter_h, filter_w)) bpad_top = filter_h - 1 - fpad_top bpad_bottom = filter_h - 1 - fpad_bottom bpad_left = filter_w - 1 - fpad_left bpad_right = filter_w - 1 - fpad_right # padding stage FirstPad = nn.pad(Input, [ 0, 0, (bpad_top + stride_h - 1) // stride_h, (bpad_left + stride_w - 1) // stride_w ], [ 0, 0, (bpad_bottom + stride_h - 1) // stride_h, (bpad_right + stride_w - 1) // stride_w ], name='FirstPad') # remove extra padding introduced by dilatation border_h = (stride_h - bpad_top % stride_h) % stride_h border_w = (stride_w - bpad_left % stride_w) % stride_w # dilation stage data = FirstPad strides = [1, 1, stride_h, stride_w] n = len(data.shape) def _dilate(*indices): not_zero = [] index_tuple = [] for i in range(n): if not equal_const_int(strides[i], 1): index_tuple.append(indices[i] // strides[i]) not_zero.append((indices[i] % strides[i]).equal(0)) else: index_tuple.append(indices[i]) if not_zero: not_zero = tvm.all(*not_zero) return tvm.if_then_else(not_zero, data(*index_tuple), tvm.const(0.0, data.dtype)) return data(*index_tuple) # convolution stage out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w dc = tvm.reduce_axis((0, in_c), name='dc') dh = tvm.reduce_axis((0, filter_h), name='dh') dw = tvm.reduce_axis((0, filter_w), name='dw') Output = tvm.compute( (batch, out_c, out_h, out_w), lambda b, c, h, w: tvm.sum(_dilate( b, dc, h + dh + border_h, w + dw + border_w).astype( out_dtype) * Filter[dc, c, filter_h - 1 - dh, filter_w - 1 - dw ].astype(out_dtype), axis=[dc, dh, dw]), tag="conv2d_transpose_nchw") return Output
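# Worked shape example for the transposed convolution above (hypothetical
# sizes): in_h = 8, stride_h = 2, filter_h = 4 with fpad_top = fpad_bottom = 1
# gives out_h = (8 - 1) * 2 - 1 - 1 + 4 = 16, i.e. exactly 2x upsampling.
in_h, stride_h, filter_h, fpad_top, fpad_bottom = 8, 2, 4, 1, 1
out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h  # 16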