def _transform(theta, input_dim, out_size, input_shape, dtype):
    num_batch = input_shape[0]
    height = input_shape[1]
    width = input_shape[2]
    num_channels = input_shape[3]

    theta = topi.reshape(theta, (num_batch, 2, 3))
    theta = topi.cast(theta, dtype)

    out_height = out_size[0]
    out_width = out_size[1]
    grid = _meshgrid(out_height, out_width)
    grid = topi.reshape(grid, (num_batch, 3, out_height * out_width))
    grid = topi.cast(grid, dtype=dtype)

    k = tvm.reduce_axis((0, 3), 'k')
    T_g = tvm.compute(
        (num_batch, 2, out_height * out_width),
        lambda b, y, x: tvm.sum(theta[b, y, k] * grid[b, k, x], axis=k),
        name='T_g')

    x_s = tvm.compute((num_batch, 1, out_height * out_width),
                      lambda i, j, p: T_g[i, 0, p], name='x_s')
    y_s = tvm.compute((num_batch, 1, out_height * out_width),
                      lambda i, j, p: T_g[i, 1, p], name='y_s')

    x_s_flat = topi.reshape(x_s, (num_batch * out_height * out_width,))
    y_s_flat = topi.reshape(y_s, (num_batch * out_height * out_width,))

    input_transformed = _interpolate(input_dim, input_shape, x_s_flat,
                                     y_s_flat, out_size, dtype)
    output = topi.reshape(input_transformed,
                          [num_batch, out_height, out_width, num_channels])
    return output
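# A minimal NumPy reference for the grid transform in _transform above,
# assuming `_meshgrid` yields homogeneous coords [x, y, 1] in [-1, 1]
# (illustrative sketch only, not part of the kernel).
#
# import numpy as np
#
# def _affine_grid_reference(theta, out_height, out_width):
#     """theta: (num_batch, 2, 3) affine matrices -> sampling coords."""
#     num_batch = theta.shape[0]
#     xs = np.linspace(-1.0, 1.0, out_width)
#     ys = np.linspace(-1.0, 1.0, out_height)
#     gx, gy = np.meshgrid(xs, ys)
#     ones = np.ones_like(gx)
#     grid = np.stack([gx.ravel(), gy.ravel(), ones.ravel()])  # (3, H*W)
#     grid = np.broadcast_to(grid, (num_batch, 3, grid.shape[1]))
#     # batched matmul mirrors T_g = sum_k theta[b, y, k] * grid[b, k, x]
#     t_g = np.einsum('byk,bkx->byx', theta, grid)  # (num_batch, 2, H*W)
#     return t_g[:, 0, :], t_g[:, 1, :]             # x_s, y_s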
def _compute(output_shape, x, y, K, trans_a, trans_b, *indices):
    """Matmul computation in terms of the output shape and the transposes.

    Parameters
    ----------
    output_shape : the final output shape, e.g., shape_x = (2, 6),
        shape_y = (8, 2), trans_a = True, trans_b = True,
        then output_shape = (6, 8).
    x : the first input tensor according to shape_x.
    y : the second input tensor according to shape_y.
    K : the length of the reduction axis; in the above example, K = 2.
    trans_a : if True, x needs to be transposed.
    trans_b : if True, y needs to be transposed.
    *indices : the output shape space for tvm.compute.

    Returns
    -------
    tvm.Tensor
    """
    N = len(output_shape)
    k = tvm.reduce_axis((0, K), 'k')
    if trans_a and not trans_b:
        # For example, A: (6, 7, 8), B: (6, 7, 9), so the length is N = 3
        # C = A' * B : (6, 8, 9), A' means the transpose of A
        # indices means the space of (6, 8, 9), k = 7
        # x_indices = indices[:1]+(7, )+indices[1:2] = (6, 7, 8)
        # y_indices = indices[:1]+(7, )+indices[2:] = (6, 7, 9)
        x_indices = indices[:(N - 2)] + (k,) + indices[(N - 2):(N - 1)]
        y_indices = indices[:(N - 2)] + (k,) + indices[(N - 1):]
        return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)
    elif not trans_a and trans_b:
        # For example, A: (6, 7, 8), B: (6, 9, 8), C = A * B' : (6, 7, 9)
        # indices means the space of (6, 7, 9), N=3, k = 8
        # x_indices = indices[:2]+(8, ) = (6, 7, 8)
        # y_indices = indices[:1]+indices[2:]+(8, ) = (6, 9, 8)
        x_indices = indices[:(N - 1)] + (k,)
        y_indices = indices[:(N - 2)] + indices[(N - 1):] + (k,)
        return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)
    elif trans_a and trans_b:
        # For example, A: (6, 8, 10), B: (6, 12, 8), C = A' * B' : (6, 10, 12)
        # indices means the space of (6, 10, 12), N=3, k = 8
        # x_indices = indices[:1]+(8, )+indices[1:2] = (6, 8, 10)
        # y_indices = indices[:1]+indices[2:]+(8, ) = (6, 12, 8)
        x_indices = indices[:(N - 2)] + (k,) + indices[(N - 2):(N - 1)]
        y_indices = indices[:(N - 2)] + indices[(N - 1):] + (k,)
        return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)
    else:
        # For example, A: (6, 15, 16), B: (6, 16, 18), C = A * B : (6, 15, 18)
        # indices means the space of (6, 15, 18), N=3, k = 16
        # x_indices = indices[:2]+(16, ) = (6, 15, 16)
        # y_indices = indices[:1]+(16, )+indices[2:] = (6, 16, 18)
        x_indices = indices[:(N - 1)] + (k,)
        y_indices = indices[:(N - 2)] + (k,) + indices[(N - 1):]
        return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)
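# A hypothetical wrapper showing how _compute plugs into tvm.compute as the
# fcompute body (assumes the legacy `tvm` API already used in this file;
# `_matmul_example` and its placeholders are illustrative, not part of the op).
#
# def _matmul_example():
#     # A: (6, 15, 16), B: (6, 16, 18) -> C = A * B : (6, 15, 18)
#     a = tvm.placeholder((6, 15, 16), name='A')
#     b = tvm.placeholder((6, 16, 18), name='B')
#     output_shape = (6, 15, 18)
#     # output_shape must already account for any requested transposes
#     return tvm.compute(
#         output_shape,
#         lambda *indices: _compute(output_shape, a, b, 16,
#                                   False, False, *indices),
#         name='matmul_c')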
def _max_pool_grad_grad_with_argmax_compute(
        placeholders, x, argmax, grad, y, ksize, strides,
        padding="VALID", ori_format_x="NCHW",
        kernel_name="cce_max_pool_grad_grad_with_argmax"):
    """
    Computes second-order gradients of the maxpooling function.

    Parameters
    ----------
    placeholders: list
        The input placeholder tensors.
    x: dict
        Include info about ori_input, format, ori_format, shape,
        ori_shape, dtype.
    argmax: dict
        Include info about ori_input, format, ori_format, shape,
        ori_shape, dtype.
    grad: dict
        Include info about grad of ori_input, format, ori_format, shape,
        ori_shape, dtype.
    y: dict
        Include info about result of function, format, ori_format, shape,
        ori_shape, dtype.
    ksize: list or tuple
        The size of the window for each dimension of the input tensor.
    strides: list or tuple
        The stride of the sliding window of the input tensor.
    padding: str
        The type of padding algorithm to use. Only support "VALID" or "SAME".
    ori_format_x: str
        Original format of x, "NCHW" or "NHWC".
    kernel_name: str
        Cce kernel name, default value is
        "cce_max_pool_grad_grad_with_argmax".

    Returns
    -------
    grad_in_l1: movement of grad from gm to l1.
    grad_im2col: im2col (vm) tensor of grad on l1.
    grad_fractal: fractal of grad from l1 to ub.
    grad_fractal_transp: transposition of grad.
    argmax_ub: movement of argmax from gm to ub.
    tensor_zero_ub: movement of zero tensor from gm to ub.
    grad_grad_col: tensor after selection.
    grad_grad: tensor after reduce_sum.
    output_res: output of the calculation.
    """
    argmax_tensor = placeholders[1]
    grad_tensor = placeholders[2]

    (grad_n, grad_c1, grad_h, grad_w, grad_c0) = grad.get("shape")

    if ori_format_x == "NHWC":
        _, kernel_h, kernel_w, _ = ksize
        _, stride_h, stride_w, _ = strides
    else:
        _, _, kernel_h, kernel_w = ksize
        _, _, stride_h, stride_w = strides

    shape_max_pool_h, pad_top, pad_bottom = \
        common.tf_get_windowed_output_size_verbose(
            grad_h, kernel_h, stride_h, padding)
    shape_max_pool_w, pad_left, pad_right = \
        common.tf_get_windowed_output_size_verbose(
            grad_w, kernel_w, stride_w, padding)

    pad_list = (pad_top, pad_bottom, pad_left, pad_right)
    stride = (stride_h, stride_w)

    # howo must be a multiple of 16
    howo = _ceil_to(shape_max_pool_h * shape_max_pool_w, BLOCK_SIZE)

    # copy argmax from gm to ub
    shape_argmax_ub = (grad_n, grad_c1 * kernel_h * kernel_w,
                       howo // BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE)
    argmax_ub = tvm.compute(shape_argmax_ub,
                            lambda *i: argmax_tensor(*i),
                            name='argmax_ub')

    # load3d compute
    shape_grad = (grad_n, grad_c1, grad_h, grad_w, grad_c0)
    grad_in_l1 = tvm.compute(shape_grad,
                             lambda *i: grad_tensor(*i),
                             name="grad_in_l1")

    # n howo c1 kh kw c0
    shape_grad_vm = (grad_n, shape_max_pool_h * shape_max_pool_w,
                     grad_c1, kernel_h, kernel_w, grad_c0)
    grad_im2col = common.img2col(grad_in_l1, shape_grad_vm,
                                 kernel_h, kernel_w, pad_list, stride)

    # n hw c1 kh kw c0 -> n c1 kh kw hw c0
    shape_fractal = (grad_n, howo // BLOCK_SIZE,
                     grad_c1 * kernel_h * kernel_w, BLOCK_SIZE, BLOCK_SIZE)
    grad_fractal = common.im2col_fractal(shape_fractal, grad_im2col,
                                         "ca", tag='')

    # (n, howo/16, c1khkw, 16, c0) -> (n, c1khkw, howo/16, 16, c0)
    shape_grad_fractal_transp = (grad_n, grad_c1 * kernel_h * kernel_w,
                                 howo // BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE)
    grad_fractal_transp = tvm.compute(
        shape_grad_fractal_transp,
        lambda i, j, k, l, m: grad_fractal[i, k, j, l, m],
        name='grad_fractal_transp')

    # declare a zero tensor and move it to ub for vsel
    dtype_tensor_zero = grad_tensor.dtype
    shape_tensor_zero = (BLOCK_SIZE,)
    tensor_zero_ub = tvm.compute(
        shape_tensor_zero,
        lambda *i: tvm.const(0, dtype=dtype_tensor_zero),
        name='tensor_zero_ub')

    # vsel compute
    shape_grad_grad_col = (grad_n, grad_c1 * kernel_h * kernel_w,
                           howo // BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE)
    grad_grad_col = tvm.compute(
        shape_grad_grad_col,
        lambda i, j, k, l, m: tvm.select(argmax_ub[i, j, k, l, m],
                                         grad_fractal_transp[i, j, k, l, m],
                                         tensor_zero_ub[m]),
        name='grad_grad_col')

    # reduce_sum
    # (n, c1khkw, howo/16, 16, c0) -> (n, c1, howo/16, 16, c0)
    m = tvm.reduce_axis((0, kernel_h * kernel_w), "m")
    shape_grad_grad = (grad_n, grad_c1, howo // BLOCK_SIZE,
                       BLOCK_SIZE, BLOCK_SIZE)
    grad_grad = tvm.compute(
        shape_grad_grad,
        lambda i, j, n, p, q: tvm.sum(
            grad_grad_col[i, j * kernel_h * kernel_w + m, n, p, q],
            axis=[m]),
        name="grad_grad")

    extract_params = {}
    extract_params["padding_mode"] = padding
    extract_params["shape_max_pool_h"] = shape_max_pool_h
    extract_params["shape_max_pool_w"] = shape_max_pool_w
    extract_params["fmap_shape"] = shape_grad
    extract_params["ksizes"] = ksize
    extract_params["strides"] = strides
    extract_params["pad"] = pad_list
    extract_params["fmap_vm_shape"] = shape_grad_vm
    extract_params["fractal_shape"] = shape_fractal
    extract_params["HoWo"] = howo

    setfmatrix_dict = {
        "conv_kernel_h": kernel_h,
        "conv_kernel_w": kernel_w,
        "conv_padding_top": pad_top,
        "conv_padding_bottom": pad_bottom,
        "conv_padding_left": pad_left,
        "conv_padding_right": pad_right,
        "conv_stride_h": stride_h,
        "conv_stride_w": stride_w,
        "conv_fm_c": grad_c1 * grad_c0,
        "conv_fm_h": grad_h,
        "conv_fm_w": grad_w,
    }

    # UB to OUT
    output_res = tvm.compute(
        (grad_n, grad_c1, shape_max_pool_h * shape_max_pool_w, BLOCK_SIZE),
        lambda i, j, l, m: grad_grad[i, j, l // 16, l % 16, m],
        name="ub_to_out",
        attrs={'extract_params': extract_params,
               'setfmatrix_dict': setfmatrix_dict})

    return grad_in_l1, grad_im2col, grad_fractal, grad_fractal_transp, \
        argmax_ub, tensor_zero_ub, grad_grad_col, grad_grad, output_res
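# Illustrative NumPy sketch of the vsel + reduce_sum core above: keep grad
# values only where argmax flags the max position, then sum over the kernel
# window. Shapes are simplified and the fractal layout is ignored; this is
# a reference for the math, not the kernel.
#
# import numpy as np
#
# def _maxpool_grad_grad_core(grad_patches, argmax_mask):
#     """grad_patches, argmax_mask: (n, c, kh*kw, out_positions)."""
#     selected = np.where(argmax_mask, grad_patches, 0.0)  # the vsel step
#     return selected.sum(axis=2)                          # window reduce_sum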
def dynamic_lstm(input_x, weight, bias, output_h, kernel_name="dynamic_lstm"):
    """
    Parameters
    ----------
    input_x : dict
        A dict object, contains a tensor's type, shape and format;
        the type can be float32, the format can be [FRACTAL_NZ].
    weight : dict
        A dict object, contains a tensor's type, shape and format;
        the type can be float32, the format can be [FRACTAL_ZN_LSTM].
    bias : dict
        A dict object, contains a tensor's type, shape and format;
        the type can be float32, the format can be [ND].
    output_h : dict
        A dict object, contains a tensor's type, shape and format;
        the type can be float32, the format can be [FRACTAL_NZ].
    kernel_name : str
        Cce kernel name, default value is "dynamic_lstm".
    """
    check_dtype(input_x, weight, bias, output_h)

    shape_x_input = input_x.get("shape")
    shape_w_input = weight.get("shape")
    shape_b_input = bias.get("shape")
    shape_output = output_h.get("shape")

    check(shape_x_input, shape_w_input, shape_b_input, shape_output)

    scan_one_num = 1
    t_size = shape_x_input[0] + scan_one_num
    m_size = shape_x_input[2]
    k_size = shape_w_input[0]
    n_size = shape_w_input[1]
    hidden_size = shape_output[1]
    block_size = n_size // hidden_size
    in_x = k_size - hidden_size

    shape_b = (1, k_size, block_size, hidden_size, 16, 16)
    shape_c = (1, block_size, hidden_size, m_size, 16, 16)
    shape_bias = (1, block_size, hidden_size, 1, 1, 16)
    shape_x = (t_size, in_x, m_size, 16, 16)
    shape_h = (1, k_size - in_x, m_size, 16, 16)
    shape_i = (1, hidden_size, m_size, 16, 16)
    shape_i_t = (t_size, hidden_size, m_size, 16, 16)

    core_num = cce.get_soc_spec("CORE_NUM")
    # one core uses 4 int64, which is 32B aligned
    shape_sync = (4 * core_num,)

    k0_size = 16
    input_dtype = input_x.get("dtype")
    data_dtype = 'float16'
    sync_dtype = 'int64'

    # define placeholder
    input_x = tvm.placeholder(shape_x, dtype=input_dtype, name='input_x')
    weight = tvm.placeholder(shape_b, dtype=input_dtype, name='weight')
    bias = tvm.placeholder(shape_bias, name='bias', dtype=input_dtype)
    s_state_h = tvm.placeholder(shape_h, dtype=input_dtype, name='state_h')
    s_state_c = tvm.placeholder(shape_i, dtype=input_dtype, name='state_c')
    sync0 = tvm.placeholder(shape_sync, name="sync0", dtype='int64')

    # compute
    # weight needs to go to ub first and be cast to float16
    weight_ub = tvm.compute(shape_b, lambda *indices: weight(*indices),
                            name="weight_ub")
    weight_fp16 = tvm.compute(
        shape_b,
        lambda *indices: weight_ub(*indices).astype(data_dtype),
        name="weight_fp16")

    # input and s_state_h need to go to ub first and be cast to float16
    shape_a_z_bigz = (t_size, m_size, k_size, 16, 16)

    # input and s_state_h are Nz and need a transform to zZ,
    # so swap axes 1 and 2
    a_ub = tvm.compute(
        shape_a_z_bigz,
        lambda *indice: tvm.select(
            indice[2] < in_x,
            input_x[indice[0], indice[2], indice[1], indice[3], indice[4]],
            s_state_h[0, indice[2] - in_x, indice[1], indice[3], indice[4]]),
        name="a_ub", tag="concat")

    shape_a_z_bigz_1 = (1, m_size, k_size, 16, 16)
    a_ub_fp16 = tvm.compute(
        shape_a_z_bigz_1,
        lambda *indices: a_ub(*indices).astype(data_dtype),
        name="a_ub_fp16")

    a_l1 = tvm.compute(shape_a_z_bigz_1,
                       lambda *indices: a_ub_fp16(*indices), name='a_l1')
    b_l1 = tvm.compute(shape_b, lambda *indices: weight_fp16(*indices),
                       name='b_l1')
    a_l0a = tvm.compute(shape_a_z_bigz, lambda *indices: a_l1(*indices),
                        name="a_l0a")
    b_l0b = tvm.compute(shape_b, lambda *indices: b_l1(*indices),
                        name="b_l0b")

    k1 = tvm.reduce_axis((0, k_size), name='k1')
    k0 = tvm.reduce_axis((0, k0_size), name='k0')

    c_l0c = tvm.compute(
        shape_c,
        lambda t, nb_0, nb_1, mb, mp, np:
        tvm.sum((a_l0a[t, mb, k1, mp, k0] *
                 b_l0b[t, k1, nb_0, nb_1, np, k0]).astype('float32'),
                axis=[k1, k0]),
        name='c_l0c')

    c_ub = tvm.compute(shape_c, lambda *indices: c_l0c(*indices),
                       name="c_ub")

    bias_ub = tvm.compute(shape_bias, lambda *indices: bias(*indices),
                          name='bias_ub')
    bias_bc_ub = te.lang.cce.broadcast(bias_ub, shape_c)
    c_ub_bias = te.lang.cce.vadd(c_ub, bias_bc_ub)

    # split matmul res
    i_t_index = 0
    j_t_index = 1
    f_t_index = 2
    o_t_index = 3
    i_t = tvm.compute(
        shape_i,
        lambda t, i, j, k, l: c_ub_bias(t, i_t_index, i, j, k, l),
        name="i_t")
    j_t = tvm.compute(
        shape_i,
        lambda t, i, j, k, l: c_ub_bias(t, j_t_index, i, j, k, l),
        name="j_t")
    f_t = tvm.compute(
        shape_i,
        lambda t, i, j, k, l: c_ub_bias(t, f_t_index, i, j, k, l),
        name="f_t")
    o_t = tvm.compute(
        shape_i,
        lambda t, i, j, k, l: c_ub_bias(t, o_t_index, i, j, k, l),
        name="o_t")

    f_t_sigmoid = sigmoid_compute(f_t)
    i_t_sigmoid = sigmoid_compute(i_t)
    o_t_sigmoid = sigmoid_compute(o_t)
    j_t_tanh = tanh_compute(j_t)

    c_t_tmp1 = te.lang.cce.vmul(s_state_c, f_t_sigmoid)
    c_t_tmp2 = te.lang.cce.vmul(j_t_tanh, i_t_sigmoid)
    update_c = te.lang.cce.vadd(c_t_tmp1, c_t_tmp2)

    update_c_gm = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_c(0, i, j, k, l),
        name="update_c_gm")

    c_t_tanh = tanh_compute(update_c)
    update_h = te.lang.cce.vmul(c_t_tanh, o_t_sigmoid)
    update_h_gm = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_h(0, i, j, k, l),
        name="update_h_gm")

    update_hc_vn = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_c_gm(0, i, j, k, l) +
                              update_h_gm(t, i, j, k, l),
        name="update_hc_vn")
    update_c_gm_vn = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_hc_vn(0, i, j, k, l),
        name="update_c_gm_vn")
    update_h_gm_vn = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_hc_vn(0, i, j, k, l),
        name="update_h_gm_vn")

    update_c_ub = tvm.compute(
        shape_i,
        lambda t, i, j, k, l: update_c_gm_vn(t, i, j, k, l),
        name="update_c_ub")
    update_c_gm_2 = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_c_ub(0, i, j, k, l),
        name="update_c_gm_2")
    update_h_ub = tvm.compute(
        shape_i,
        lambda t, i, j, k, l: update_h_gm_vn(t, i, j, k, l),
        name="update_h_ub")
    update_h_gm_2 = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_h_ub(0, i, j, k, l) +
                              update_c_gm_2(t, i, j, k, l),
        name="update_h_gm_2")
    update_h_gm_2_dummy = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_h_gm_2(t, i, j, k, l),
        name="update_h_gm_2_dummy")

    # state init
    init_shape = (1, hidden_size, m_size, 16, 16)
    s_state_h_ub = tvm.compute(
        shape_h, lambda *indices: tvm.const(0.0, dtype=input_dtype),
        name='s_state_h_ub')
    s_state_c_ub = tvm.compute(
        shape_i, lambda *indices: tvm.const(0.0, dtype=input_dtype),
        name='s_state_c_ub')
    s_init_h = tvm.compute(
        init_shape,
        lambda _, i, j, k, l: s_state_h_ub[0, i, j, k, l],
        name="s_init_h")
    s_init_c = tvm.compute(
        init_shape,
        lambda _, i, j, k, l: s_state_c_ub[0, i, j, k, l],
        name="s_init_c")

    # scan
    scan_h, scan_c = tvm.scan(
        [s_init_h, s_init_c],
        [update_h_ub, update_c_ub],
        [s_state_h, s_state_c],
        scan_update=[update_h_gm_2, update_h_gm_2_dummy],
        name="lstm_scan")
    # end compute

    # schedule
    s = tvm.create_schedule([scan_h.op, scan_c.op])

    new_build_list = [input_x, weight, bias, update_h_gm, update_c_gm,
                      sync0, update_h_gm_vn, update_c_gm_vn]

    def gen_reversed_subgraph_list(out_tensor, tensor_list):
        """traverse tensors by Depth-First-Search"""
        if out_tensor is None:
            return
        stack = [out_tensor]
        visited_list = []
        while stack:
            cur_tensor = stack.pop()
            visited_list.append(cur_tensor)
            for in_tensor in cur_tensor.op.input_tensors:
                if in_tensor not in visited_list:
                    stack.append(in_tensor)
                    if "elewise" in in_tensor.op.tag or \
                            "broadcast" == in_tensor.op.tag:
                        if in_tensor not in tensor_list:
                            tensor_list.append(in_tensor)

    elewise_tensors = []
    gen_reversed_subgraph_list(update_h_gm, elewise_tensors)

    barrier_tensor = c_ub_bias
    elewise_before_barrier_tensors = [bias_bc_ub]

    # set scope
    s[a_l1].set_scope(cce.scope_cbuf)
    s[b_l1].set_scope(cce.scope_cbuf)
    s[a_l0a].set_scope(cce.scope_ca)
    s[b_l0b].set_scope(cce.scope_cb)
    s[c_l0c].set_scope(cce.scope_cc)
    s[c_ub].set_scope(cce.scope_ubuf)
    s[s_init_h].set_scope(cce.scope_ubuf)
    s[bias_ub].set_scope(cce.scope_ubuf)
    s[bias_bc_ub].set_scope(cce.scope_ubuf)
    s[scan_h].set_scope(cce.scope_ubuf)
    s[scan_c].set_scope(cce.scope_ubuf)
    s[update_h_ub].set_scope(cce.scope_ubuf)
    s[update_c_ub].set_scope(cce.scope_ubuf)
    s[s_state_h_ub].set_scope(cce.scope_ubuf)
    s[s_state_c_ub].set_scope(cce.scope_ubuf)
    s[weight_ub].set_scope(cce.scope_ubuf)
    s[weight_fp16].set_scope(cce.scope_ubuf)
    s[a_ub].set_scope(cce.scope_ubuf)
    s[a_ub_fp16].set_scope(cce.scope_ubuf)
    for tensor in elewise_tensors:
        s[tensor].set_scope(cce.scope_ubuf)

    # compute inline
    compute_inline_tensors = [i_t, j_t, f_t, o_t]
    for tensor in compute_inline_tensors:
        s[tensor].compute_inline()

    # matmul tiling
    factor_l1_m, factor_l1_n, factor_l1_k, \
        factor_l0_m, factor_l0_n, factor_l0_k = \
        _get_lstm_tiling(m_size, k_size, n_size)

    l1_n_outer, l1_n_inner = s[c_l0c].split(
        c_l0c.op.axis[2], factor=factor_l1_n // block_size)
    l1_m_outer, l1_m_inner = s[c_l0c].split(c_l0c.op.axis[3],
                                            factor=factor_l1_m)
    l1_k_outer, l1_k_inner = s[c_l0c].split(c_l0c.op.reduce_axis[0],
                                            factor=factor_l1_k)
    l0_n_outer, l0_n_inner = s[c_l0c].split(l1_n_inner, factor=factor_l0_n)
    l0_m_outer, l0_m_inner = s[c_l0c].split(l1_m_inner, factor=factor_l0_m)
    l0_k_outer, l0_k_inner = s[c_l0c].split(l1_k_inner, factor=factor_l0_k)

    s[c_l0c].reorder(l1_n_outer, c_l0c.op.axis[1],
                     l1_m_outer, l1_k_outer,
                     l0_n_outer, l0_m_outer, l0_k_outer,
                     l0_n_inner, l0_m_inner,
                     c_l0c.op.axis[3 + 1], c_l0c.op.axis[4 + 1],
                     l0_k_inner, c_l0c.op.reduce_axis[1])

    s[weight_ub].compute_at(s[c_l0c], l1_k_outer)
    s[weight_fp16].compute_at(s[c_l0c], l1_k_outer)
    s[a_ub].compute_at(s[c_l0c], l1_k_outer)
    s[a_ub_fp16].compute_at(s[c_l0c], l1_k_outer)
    s[a_l0a].compute_at(s[c_l0c], l0_k_outer)
    s[b_l0b].compute_at(s[c_l0c], l0_k_outer)
    s[a_l1].compute_at(s[c_l0c], l1_k_outer)
    s[b_l1].compute_at(s[c_l0c], l1_k_outer)

    ub_n_outer, ub_n_inner = s[c_ub].split(
        c_ub.op.axis[2], factor=factor_l1_n // block_size)
    ub_m_outer, ub_m_inner = s[c_ub].split(c_ub.op.axis[3],
                                           factor=factor_l1_m)
    s[c_ub].reorder(ub_n_outer, c_ub.op.axis[1], ub_m_outer,
                    ub_n_inner, ub_m_inner,
                    c_ub.op.axis[4], c_ub.op.axis[5])

    s[c_l0c].compute_at(s[c_ub], ub_n_outer)

    # elewise compute_at
    barrier_outer, barrier_inner = s[barrier_tensor].split(
        barrier_tensor.op.axis[2], factor=factor_l1_n // block_size)
    s[barrier_tensor].reorder(
        barrier_tensor.op.axis[0], barrier_outer,
        barrier_tensor.op.axis[1], barrier_inner,
        barrier_tensor.op.axis[3], barrier_tensor.op.axis[4],
        barrier_tensor.op.axis[5])

    s[c_ub].compute_at(s[barrier_tensor], barrier_outer)
    s[bias_ub].compute_at(s[barrier_tensor], barrier_outer)
    for tensor in elewise_before_barrier_tensors:
        s[tensor].compute_at(s[barrier_tensor], barrier_outer)

    vn_outer, vn_inner = s[update_hc_vn].split(
        update_hc_vn.op.axis[0 + 1], factor=factor_l1_n // block_size)
    second_split_factor = \
        (hidden_size // (factor_l1_n // block_size)) // core_num
    vn_o_outer, vn_o_inner = s[update_hc_vn].split(
        vn_outer, factor=second_split_factor)

    s[barrier_tensor].compute_at(s[update_hc_vn], vn_o_inner)
    for tensor in elewise_tensors:
        if tensor not in elewise_before_barrier_tensors:
            s[tensor].compute_at(s[update_hc_vn], vn_o_inner)

    s[update_c_gm].compute_at(s[update_hc_vn], vn_o_inner)
    s[update_h_gm].compute_at(s[update_hc_vn], vn_o_inner)

    second_split_factor = hidden_size // core_num
    res_h_outer, res_h_inner = s[update_h_gm_2].split(
        update_h_gm_2.op.axis[1], factor=hidden_size)

    s[update_hc_vn].compute_at(s[update_h_gm_2], update_h_gm_2.op.axis[0])
    s[update_c_gm_vn].compute_at(s[update_h_gm_2], res_h_outer)
    s[update_h_gm_vn].compute_at(s[update_h_gm_2], res_h_outer)
    s[update_c_ub].compute_at(s[update_h_gm_2], res_h_outer)
    s[update_c_gm_2].compute_at(s[update_h_gm_2], res_h_outer)
    s[update_h_ub].compute_at(s[update_h_gm_2], res_h_outer)

    s[update_h_gm_vn].bind_buffer(
        update_h_gm_vn.op.axis[0], 0, scan_h.op.scan_axis + res_h_outer)
    s[update_c_gm_vn].bind_buffer(
        update_c_gm_vn.op.axis[0], 0, scan_h.op.scan_axis + res_h_outer)

    # bind
    s[update_hc_vn].bind(vn_o_outer, tvm.thread_axis("blockIdx.x"))

    # multi core sync
    s[update_hc_vn].pragma(update_hc_vn.op.axis[0],
                           pragma_type="multicore_sync_wait_after",
                           pragma_value=sync0[0])
    s[update_hc_vn].pragma(update_hc_vn.op.axis[0],
                           pragma_type="multicore_sync_set_after",
                           pragma_value=sync0[0])

    # modify for extend
    s[input_x].bind_buffer(0, 0, scan_h.op.scan_axis)
    s[update_h_gm].buffer_tile((scan_h.op.scan_axis * 1, 1), (None, None),
                               (None, None), (None, None), (None, None))
    s[update_c_gm].buffer_tile((scan_h.op.scan_axis * 1, 1), (None, None),
                               (None, None), (None, None), (None, None))
    s[update_h_gm_2].buffer_tile((0, 1), (None, None), (None, None),
                                 (None, None), (None, None))
    s[update_c_gm_2].buffer_tile((0, 1), (None, None), (None, None),
                                 (None, None), (None, None))

    # buffer reuse
    s[update_h_gm].reused_by(update_h_gm_vn)
    s[update_c_gm].reused_by(update_c_gm_vn)

    # emit_insn
    s[a_l1].emit_insn(a_l1.op.axis[0], 'dma_copy')
    s[b_l1].emit_insn(b_l1.op.axis[0], 'dma_copy')
    s[a_l0a].emit_insn(a_l0a.op.axis[0], 'dma_copy')
    s[b_l0b].emit_insn(b_l0b.op.axis[0], 'dma_copy')
    s[weight_ub].emit_insn(weight_ub.op.axis[0], 'dma_copy')
    s[weight_fp16].emit_insn(weight_fp16.op.axis[0], 'vector_conv')
    s[a_ub].emit_insn(a_ub.op.axis[0], 'dma_copy')
    s[a_ub_fp16].emit_insn(a_ub_fp16.op.axis[0], 'vector_conv')

    mad_dict = {"mad_pattern": 0, "k_outer": [l1_k_outer, l0_k_outer]}
    s[c_l0c].emit_insn(l0_n_inner, 'mad', mad_dict)
    s[c_ub].emit_insn(ub_n_inner, 'dma_copy')
    s[s_init_h].emit_insn(s_init_h.op.axis[0], 'dma_copy')
    s[s_init_c].emit_insn(s_init_c.op.axis[0], 'dma_copy')
    s[bias_bc_ub].emit_insn(bias_bc_ub.op.axis[0], 'unified_broadcast')
    s[s_state_h_ub].emit_insn(s_state_h_ub.op.axis[0], 'broadcast')
    s[s_state_c_ub].emit_insn(s_state_c_ub.op.axis[0], 'broadcast')
    s[barrier_tensor].emit_insn(barrier_tensor.op.axis[1], 'vector_add')

    for tensor in elewise_tensors:
        if tensor != barrier_tensor:
            insn = get_emit_insn_map(tensor)
            s[tensor].emit_insn(tensor.op.axis[0], insn)

    s[bias_ub].emit_insn(bias_ub.op.axis[0], 'dma_copy')
    s[update_c_gm].emit_insn(s[update_c_gm].op.axis[1], 'dma_copy')
    s[update_h_gm].emit_insn(s[update_h_gm].op.axis[1], 'dma_copy')
    s[update_c_ub].emit_insn(update_c_ub.op.axis[1], 'dma_copy')
    s[update_h_ub].emit_insn(update_h_ub.op.axis[1], 'dma_copy')
    s[update_hc_vn].emit_insn(vn_inner, 'phony_insn')
    s[update_c_gm_vn].emit_insn(s[update_c_gm_vn].op.axis[0], 'phony_insn')
    s[update_h_gm_vn].emit_insn(s[update_h_gm_vn].op.axis[0], 'phony_insn')
    s[update_h_gm_2].emit_insn(res_h_inner, 'phony_insn')
    s[update_c_gm_2].emit_insn(s[update_c_gm_2].op.axis[0], 'phony_insn')
    s[update_h_gm_2_dummy].emit_insn(
        update_h_gm_2_dummy.op.axis[0], 'phony_insn')

    def _write_workspace_info(shape_list, dtype_list, sync_num, kernel_name):
        """modify json after build"""
        def _write_code(wkspace_dict, fname):
            fname = os.path.realpath(fname)
            if fname.startswith(os.getcwd()):
                if os.path.exists(fname):
                    with open(fname, "r") as f:
                        load_dict = json.load(f)
                    load_dict.update(wkspace_dict)
                    with open(fname, "w") as f:
                        json.dump(load_dict, f, sort_keys=True, indent=4,
                                  separators=(',', ':'))

        def _get_data_width(ele):
            """get data width"""
            m_sea = re.search(r'\d+', ele)
            if m_sea:
                return int(m_sea.group(0)) // 8
            return 0

        if not os.path.exists("kernel_meta"):
            os.mkdir("kernel_meta")
            os.chmod("kernel_meta",
                     stat.S_IRWXU + stat.S_IRGRP + stat.S_IXGRP)

        num = len(shape_list)
        wkspace_dict = {}
        if num:
            total_size = [functools_reduce(lambda x, y: x * y, list_i)
                          for list_i in shape_list]
            addr_type_list = []
            for i, element in enumerate(dtype_list):
                total_size[i] = total_size[i] * _get_data_width(element)
                addr_type_list.append(0)
            wkspace_dict["workspace"] = {"num": num,
                                         "size": total_size,
                                         "type": addr_type_list}
        if sync_num:
            parameters_list = \
                (len(new_build_list) - 2 - sync_num) * [0, ] + \
                sync_num * [1, ]
            wkspace_dict["parameters"] = parameters_list
        if wkspace_dict:
            _write_code(wkspace_dict, "kernel_meta/" + kernel_name + ".json")

    with build_config:
        tvm.build(s, new_build_list, "cce", name=kernel_name)
    _write_workspace_info(
        [shape_i_t, shape_sync],
        [input_dtype, sync_dtype],
        1, kernel_name)
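# NumPy reference for the element-wise gate chain in dynamic_lstm above
# (sigmoid on i/f/o, tanh on j, then c' = f*c + i*tanh(j) and
# h' = o*tanh(c')). Shapes are simplified; illustrative only.
#
# import numpy as np
#
# def _lstm_cell_reference(gates, c_prev):
#     """gates: stacked [i, j, f, o], matching the split order above."""
#     sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
#     i_t, j_t, f_t, o_t = gates
#     c_next = sigmoid(f_t) * c_prev + sigmoid(i_t) * np.tanh(j_t)
#     h_next = sigmoid(o_t) * np.tanh(c_next)
#     return h_next, c_next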
def basic_rnn_cell_compute(self):
    """
    calculating data

    Inputs are taken from self.datas (GM placeholders) and self.dims;
    intermediate tensors, emit instructions and scopes are recorded in
    self.tensor_list1 / self.tensor_list2, self.emit_cmd and
    self.scope_list.

    Returns
    -------
    output tensor
    """
    matmul_res_shape = (self.dims["hidden_dim"], self.dims["batch_dim"],
                        16, 16)

    # Tensor x from GM to L1, L0A
    l1_x = tvm.compute(
        (self.dims["batch_dim"], self.dims["input_dim"], 16, 16),
        lambda i0, i1, i2, i3: self.datas["x"][i1, i0, i2, i3],
        name='l1_x')
    self.tensor_list1["l1_x"] = l1_x
    self.emit_cmd["l1_x"] = "dma_copy"
    self.scope_list["l1_x"] = cce.scope_cbuf

    l0a_x = tvm.compute(l1_x.shape, lambda *i: l1_x(*i), name='l0a_x')
    self.tensor_list1["l0a_x"] = l0a_x
    self.emit_cmd["l0a_x"] = "dma_copy"
    self.scope_list["l0a_x"] = cce.scope_ca

    # Tensor w_xh from GM to L1, L0B
    l1_w_xh = tvm.compute(self.datas["w_xh"].shape,
                          lambda *i: self.datas["w_xh"](*i),
                          name='l1_w_xh')
    self.tensor_list1["l1_w_xh"] = l1_w_xh
    self.emit_cmd["l1_w_xh"] = "dma_copy"
    self.scope_list["l1_w_xh"] = cce.scope_cbuf

    l0b_w_xh = tvm.compute(l1_w_xh.shape, lambda *i: l1_w_xh(*i),
                           name='l0b_w_xh')
    self.tensor_list1["l0b_w_xh"] = l0b_w_xh
    self.emit_cmd["l0b_w_xh"] = "dma_copy"
    self.scope_list["l0b_w_xh"] = cce.scope_cb

    # Copy bias from GM to UB
    ub_bias_h = tvm.compute(self.datas["bias_h"].shape,
                            lambda *i: self.datas["bias_h"](*i),
                            name='ub_bias_h')
    self.tensor_list1["ub_bias_h"] = ub_bias_h
    self.emit_cmd["ub_bias_h"] = "dma_copy"
    self.scope_list["ub_bias_h"] = cce.scope_ubuf

    if ub_bias_h.dtype == "float16" and self.device != "hisi_es":
        l0c_bias_h = tvm.compute(
            matmul_res_shape,
            lambda i0, i1, i2, i3: ub_bias_h[i0, i3].astype("float32"),
            name='l0c_bias_h')
    else:
        l0c_bias_h = tvm.compute(matmul_res_shape,
                                 lambda i0, i1, i2, i3: ub_bias_h[i0, i3],
                                 name='l0c_bias_h')
    self.tensor_list1["l0c_bias_h"] = l0c_bias_h
    self.emit_cmd["l0c_bias_h"] = "dma_copy"
    self.scope_list["l0c_bias_h"] = cce.scope_cc

    reduce_kb = tvm.reduce_axis((0, self.dims["input_dim"]),
                                name='reduce_kb')
    reduce_kp = tvm.reduce_axis((0, 16), name='reduce_kp')
    if self.device == "hisi_es":
        l0c_wht_xt = tvm.compute(
            matmul_res_shape,
            lambda nb, mb, mp, np: tvm.sum(
                (l0a_x[mb, reduce_kb, mp, reduce_kp] *
                 l0b_w_xh[reduce_kb, nb, np, reduce_kp]),
                axis=[reduce_kb, reduce_kp]),
            name='l0c_wht_xt',
            attrs={'input_order': 'positive'})
    else:
        l0c_wht_xt = tvm.compute(
            matmul_res_shape,
            lambda nb, mb, mp, np: tvm.sum(
                (l0a_x[mb, reduce_kb, mp, reduce_kp] *
                 l0b_w_xh[reduce_kb, nb, np, reduce_kp]).astype("float32"),
                axis=[reduce_kb, reduce_kp]),
            name='l0c_wht_xt',
            attrs={'input_order': 'positive'})
    self.tensor_list1["l0c_wht_xt"] = l0c_wht_xt
    self.scope_list["l0c_wht_xt"] = cce.scope_cc

    # Matmul accumulation wht_xt + bias_h
    l0c_wht_xt_bias_h = tvm.compute(
        matmul_res_shape,
        lambda *i: l0c_bias_h(*i) + l0c_wht_xt(*i),
        name="l0c_wht_xt_bias_h")
    self.tensor_list1["l0c_wht_xt_bias_h"] = l0c_wht_xt_bias_h
    self.emit_cmd["l0c_wht_xt_bias_h"] = "phony_insn"
    self.scope_list["l0c_wht_xt_bias_h"] = cce.scope_cc

    # Move ht to UB
    ub_wht_xt_bias_h = tvm.compute(matmul_res_shape,
                                   lambda *i: l0c_wht_xt_bias_h(*i),
                                   name='ub_wht_xt_bias_h')
    self.tensor_list1["ub_wht_xt_bias_h"] = ub_wht_xt_bias_h
    self.emit_cmd["ub_wht_xt_bias_h"] = "dma_copy"
    self.scope_list["ub_wht_xt_bias_h"] = cce.scope_ubuf

    if self.expose_hidden:
        ub_ht_tmp1 = self.compute_h_0_whh(ub_wht_xt_bias_h)
    else:
        ub_ht_tmp1 = ub_wht_xt_bias_h

    if self.has_static:
        # Copy static input from GM to UB
        ub_w_xh_x_static = tvm.compute(
            matmul_res_shape,
            lambda *i: self.datas["w_xh_x_static"](*i),
            name='ub_w_xh_x_static')
        self.tensor_list1["ub_w_xh_x_static"] = ub_w_xh_x_static
        self.emit_cmd["ub_w_xh_x_static"] = "dma_copy"
        self.scope_list["ub_w_xh_x_static"] = cce.scope_ubuf

        if ub_w_xh_x_static.dtype == "float16" \
                and self.device != "hisi_es":
            ub_w_xh_x_static_fp32 = tvm.compute(
                ub_w_xh_x_static.shape,
                lambda *i: topi.cast(ub_w_xh_x_static(*i), "float32"),
                name="ub_w_xh_x_static_fp32")
            self.tensor_list1["ub_w_xh_x_static_fp32"] = \
                ub_w_xh_x_static_fp32
            self.emit_cmd["ub_w_xh_x_static_fp32"] = "vector_conv"
            self.scope_list["ub_w_xh_x_static_fp32"] = cce.scope_ubuf
        else:
            ub_w_xh_x_static_fp32 = ub_w_xh_x_static

        ub_ht_tmp2 = tvm.compute(
            matmul_res_shape,
            lambda *i: ub_ht_tmp1(*i) + ub_w_xh_x_static_fp32(*i),
            name="ub_ht_tmp2")
        self.tensor_list1["ub_ht_tmp2"] = ub_ht_tmp2
        self.emit_cmd["ub_ht_tmp2"] = "vector_add"
        self.scope_list["ub_ht_tmp2"] = cce.scope_ubuf
    else:
        ub_ht_tmp2 = ub_ht_tmp1

    tanh_ht_tensor, ht_tanh_op, ht_tanh_scope = \
        tanh_compute(ub_ht_tmp2.shape, ub_ht_tmp2, "ht", self.impl_mode)

    if self.dtypes["h_t"] == "float16" and self.device != "hisi_es":
        ub_ht_fp16 = tvm.compute(
            matmul_res_shape,
            lambda *i: topi.cast(tanh_ht_tensor["ub_tanh_ht"](*i),
                                 "float16"),
            name='ub_ht_fp16')
        tanh_ht_tensor["ub_ht_fp16"] = ub_ht_fp16
        ht_tanh_op["ub_ht_fp16"] = "vector_conv"
        ht_tanh_scope["ub_ht_fp16"] = cce.scope_ubuf
        ub_ht = ub_ht_fp16
    else:
        ub_ht = tanh_ht_tensor["ub_tanh_ht"]
    self.tanh_ht_tensor = tanh_ht_tensor
    self.scope_list.update(ht_tanh_scope)
    self.tensor_list1.update(tanh_ht_tensor)
    self.emit_cmd.update(ht_tanh_op)

    gm_ht = tvm.compute(matmul_res_shape, lambda *i: ub_ht(*i),
                        name='gm_ht')
    self.tensor_list1["gm_ht"] = gm_ht
    self.scope_list["gm_ht"] = cce.scope_gm

    # Tensor ht from GM to L1, L0A
    if gm_ht.dtype == "float32":
        ub_ht_new = tvm.compute(matmul_res_shape, lambda *i: gm_ht(*i),
                                name='ub_ht_new')
        self.tensor_list2["ub_ht_new"] = ub_ht_new
        self.emit_cmd["ub_ht_new"] = "dma_copy"
        self.scope_list["ub_ht_new"] = cce.scope_ubuf

        ub_ht_fp16 = tvm.compute(
            ub_ht_new.shape,
            lambda *i: topi.cast(ub_ht_new(*i), "float16"),
            name="ub_ht_fp16")
        self.tensor_list2["ub_ht_fp16"] = ub_ht_fp16
        self.emit_cmd["ub_ht_fp16"] = "vector_conv"
        self.scope_list["ub_ht_fp16"] = cce.scope_ubuf
    else:
        ub_ht_fp16 = gm_ht

    l1_ht = tvm.compute(
        (self.dims["batch_dim"], self.dims["hidden_dim"], 16, 16),
        lambda i0, i1, i2, i3: ub_ht_fp16[i1, i0, i2, i3],
        name='l1_ht')
    self.tensor_list2["l1_ht"] = l1_ht
    self.emit_cmd["l1_ht"] = "dma_copy"
    self.scope_list["l1_ht"] = cce.scope_cbuf

    l0a_ht = tvm.compute(l1_ht.shape, lambda *i: l1_ht(*i), name='l0a_ht')
    self.tensor_list2["l0a_ht"] = l0a_ht
    self.emit_cmd["l0a_ht"] = "dma_copy"
    self.scope_list["l0a_ht"] = cce.scope_ca

    # Tensor w_ho from GM to L1, L0B
    l1_w_ho = tvm.compute(self.datas["w_ho"].shape,
                          lambda *i: self.datas["w_ho"](*i),
                          name='l1_w_ho')
    self.tensor_list2["l1_w_ho"] = l1_w_ho
    self.emit_cmd["l1_w_ho"] = "dma_copy"
    self.scope_list["l1_w_ho"] = cce.scope_cbuf

    l0b_w_ho = tvm.compute(l1_w_ho.shape, lambda *i: l1_w_ho(*i),
                           name='l0b_w_ho')
    self.tensor_list2["l0b_w_ho"] = l0b_w_ho
    self.emit_cmd["l0b_w_ho"] = "dma_copy"
    self.scope_list["l0b_w_ho"] = cce.scope_cb

    # Copy bias from GM to UB
    ub_bias_o = tvm.compute(self.datas["bias_o"].shape,
                            lambda *i: self.datas["bias_o"](*i),
                            name='ub_bias_o')
    self.tensor_list2["ub_bias_o"] = ub_bias_o
    self.emit_cmd["ub_bias_o"] = "dma_copy"
    self.scope_list["ub_bias_o"] = cce.scope_ubuf

    if ub_bias_o.dtype == "float16" and self.device != "hisi_es":
        l0c_bias_o = tvm.compute(
            matmul_res_shape,
            lambda i0, i1, i2, i3: ub_bias_o[i0, i3].astype("float32"),
            name='l0c_bias_o')
    else:
        l0c_bias_o = tvm.compute(matmul_res_shape,
                                 lambda i0, i1, i2, i3: ub_bias_o[i0, i3],
                                 name='l0c_bias_o')
    self.tensor_list2["l0c_bias_o"] = l0c_bias_o
    self.emit_cmd["l0c_bias_o"] = "dma_copy"
    self.scope_list["l0c_bias_o"] = cce.scope_cc

    reduce_kb = tvm.reduce_axis((0, self.dims["hidden_dim"]),
                                name='reduce_kb')
    reduce_kp = tvm.reduce_axis((0, 16), name='reduce_kp')
    if self.device == "hisi_es":
        l0c_who_ht = tvm.compute(
            matmul_res_shape,
            lambda nb, mb, mp, np: tvm.sum(
                (l0a_ht[mb, reduce_kb, mp, reduce_kp] *
                 l0b_w_ho[reduce_kb, nb, np, reduce_kp]),
                axis=[reduce_kb, reduce_kp]),
            name='l0c_who_ht',
            attrs={'input_order': 'positive'})
    else:
        l0c_who_ht = tvm.compute(
            matmul_res_shape,
            lambda nb, mb, mp, np: tvm.sum(
                (l0a_ht[mb, reduce_kb, mp, reduce_kp] *
                 l0b_w_ho[reduce_kb, nb, np, reduce_kp]).astype("float32"),
                axis=[reduce_kb, reduce_kp]),
            name='l0c_who_ht',
            attrs={'input_order': 'positive'})
    self.tensor_list2["l0c_who_ht"] = l0c_who_ht
    self.scope_list["l0c_who_ht"] = cce.scope_cc

    # Matmul accumulation who_ht + bias_o
    l0c_who_ht_bias_o = tvm.compute(
        matmul_res_shape,
        lambda *i: l0c_bias_o(*i) + l0c_who_ht(*i),
        name="l0c_who_ht_bias_o")
    self.tensor_list2["l0c_who_ht_bias_o"] = l0c_who_ht_bias_o
    self.emit_cmd["l0c_who_ht_bias_o"] = "phony_insn"
    self.scope_list["l0c_who_ht_bias_o"] = cce.scope_cc

    # Move who_ht_bias_o to UB
    ub_who_ht_bias_o = tvm.compute(matmul_res_shape,
                                   lambda *i: l0c_who_ht_bias_o(*i),
                                   name='ub_who_ht_bias_o')
    self.tensor_list2["ub_who_ht_bias_o"] = ub_who_ht_bias_o
    self.emit_cmd["ub_who_ht_bias_o"] = "dma_copy"
    self.scope_list["ub_who_ht_bias_o"] = cce.scope_ubuf

    tanh_ot_tensor, tanh_ot_operator, tanh_ot_scope = \
        tanh_compute(ub_who_ht_bias_o.shape, ub_who_ht_bias_o, "ot",
                     self.impl_mode)

    if self.dtypes["o_t"] == "float16" and self.device != "hisi_es":
        ub_ot_fp16 = tvm.compute(
            matmul_res_shape,
            lambda *i: topi.cast(tanh_ot_tensor["ub_tanh_ot"](*i),
                                 "float16"),
            name='ub_ot_fp16')
        tanh_ot_tensor["ub_ot_fp16"] = ub_ot_fp16
        tanh_ot_operator["ub_ot_fp16"] = "vector_conv"
        tanh_ot_scope["ub_ot_fp16"] = cce.scope_ubuf
        ub_ot = ub_ot_fp16
    else:
        ub_ot = tanh_ot_tensor["ub_tanh_ot"]
    self.tanh_ot_tensor = tanh_ot_tensor
    self.scope_list.update(tanh_ot_scope)
    self.tensor_list2.update(tanh_ot_tensor)
    self.emit_cmd.update(tanh_ot_operator)

    gm_ot = tvm.compute(matmul_res_shape, lambda *i: ub_ot(*i),
                        name='gm_ot')
    self.tensor_list2["gm_ot"] = gm_ot
    self.emit_cmd["gm_ot"] = "dma_copy"
    self.scope_list["gm_ot"] = cce.scope_gm

    res_empty = tvm.compute(matmul_res_shape,
                            lambda *i: gm_ot(*i) * gm_ht(*i),
                            name='res_empty')
    self.tensor_list2["res_empty"] = res_empty
    self.emit_cmd["res_empty"] = "phony_insn"
    self.scope_list["res_empty"] = cce.scope_ubuf

    schedule_list = [res_empty.op]
    sch = self.basic_rnn_cell_schedule(schedule_list)

    if self.has_static:
        build_list = (self.datas["x"], self.datas["cont"],
                      self.datas["w_xh_x_static"], self.datas["h_0"],
                      self.datas["w_xh"], self.datas["bias_h"],
                      self.datas["w_hh"], self.datas["w_ho"],
                      self.datas["bias_o"], gm_ot, gm_ht)
    else:
        if self.expose_hidden:
            build_list = (self.datas["x"], self.datas["cont"],
                          self.datas["h_0"], self.datas["w_xh"],
                          self.datas["bias_h"], self.datas["w_hh"],
                          self.datas["w_ho"], self.datas["bias_o"],
                          gm_ot, gm_ht)
        else:
            build_list = (self.datas["x"], self.datas["w_xh"],
                          self.datas["bias_h"], self.datas["w_ho"],
                          self.datas["bias_o"], gm_ot, gm_ht)

    with build_config:
        tvm.build(sch, build_list, "cce", name=self.kernel_name)
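# NumPy reference for the cell math assembled in basic_rnn_cell_compute:
# h_t = tanh(W_xh.x + b_h [+ recurrent/static terms]) and
# o_t = tanh(W_ho.h_t + b_o). Illustrative only; the real kernel works on
# 16x16 fractal tiles with per-device dtype handling.
#
# import numpy as np
#
# def _basic_rnn_cell_reference(x, w_xh, bias_h, w_ho, bias_o,
#                               recurrent=0.0):
#     h_t = np.tanh(w_xh @ x + bias_h + recurrent)
#     o_t = np.tanh(w_ho @ h_t + bias_o)
#     return o_t, h_t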
def compute_h_0_whh(self, wht_xt_bias_h):
    """
    calculating h_0_whh

    Parameters
    ----------
    wht_xt_bias_h : TVM tensor

    Returns
    -------
    output tensor
    """
    matmul_res_shape = (self.dims["hidden_dim"], self.dims["batch_dim"],
                        16, 16)

    # Tensor h_0 from GM to L1, L0A
    h_0_fp16 = self.datas["h_0"]
    if self.dtypes["h_0"] == "float32":
        ub_h_0 = tvm.compute(
            (self.dims["hidden_dim"], self.dims["batch_dim"], 16, 16),
            lambda *i: self.datas["h_0"](*i),
            name='ub_h_0')
        self.tensor_list1["ub_h_0"] = ub_h_0
        self.emit_cmd["ub_h_0"] = "dma_copy"
        self.scope_list["ub_h_0"] = cce.scope_ubuf

        h_0_fp16 = tvm.compute(ub_h_0.shape,
                               lambda *i: topi.cast(ub_h_0(*i), "float16"),
                               name="h_0_fp16")
        self.tensor_list1["h_0_fp16"] = h_0_fp16
        self.emit_cmd["h_0_fp16"] = "vector_conv"
        self.scope_list["h_0_fp16"] = cce.scope_ubuf

    l1_h_0 = tvm.compute(
        (self.dims["batch_dim"], self.dims["hidden_dim"], 16, 16),
        lambda i0, i1, i2, i3: h_0_fp16[i1, i0, i2, i3],
        name='l1_h_0')
    self.tensor_list1["l1_h_0"] = l1_h_0
    self.emit_cmd["l1_h_0"] = "dma_copy"
    self.scope_list["l1_h_0"] = cce.scope_cbuf

    l0a_h_0 = tvm.compute(l1_h_0.shape, lambda *i: l1_h_0(*i),
                          name='l0a_h_0')
    self.tensor_list1["l0a_h_0"] = l0a_h_0
    self.emit_cmd["l0a_h_0"] = "dma_copy"
    self.scope_list["l0a_h_0"] = cce.scope_ca

    # Tensor w_hh from GM to L1, L0B
    l1_w_hh = tvm.compute(self.datas["w_hh"].shape,
                          lambda *i: self.datas["w_hh"](*i),
                          name='l1_w_hh')
    self.tensor_list1["l1_w_hh"] = l1_w_hh
    self.emit_cmd["l1_w_hh"] = "dma_copy"
    self.scope_list["l1_w_hh"] = cce.scope_cbuf

    l0b_w_hh = tvm.compute(l1_w_hh.shape, lambda *i: l1_w_hh(*i),
                           name='l0b_w_hh')
    self.tensor_list1["l0b_w_hh"] = l0b_w_hh
    self.emit_cmd["l0b_w_hh"] = "dma_copy"
    self.scope_list["l0b_w_hh"] = cce.scope_cb

    reduce_kb = tvm.reduce_axis((0, self.dims["hidden_dim"]),
                                name='reduce_kb')
    reduce_kp = tvm.reduce_axis((0, 16), name='reduce_kp')
    if self.device == "hisi_es":
        l0c_whh_ht = tvm.compute(
            matmul_res_shape,
            lambda nb, mb, mp, np: tvm.sum(
                (l0a_h_0[mb, reduce_kb, mp, reduce_kp] *
                 l0b_w_hh[reduce_kb, nb, np, reduce_kp]),
                axis=[reduce_kb, reduce_kp]),
            name='l0c_whh_ht',
            attrs={'input_order': 'positive'})
    else:
        l0c_whh_ht = tvm.compute(
            matmul_res_shape,
            lambda nb, mb, mp, np: tvm.sum(
                (l0a_h_0[mb, reduce_kb, mp, reduce_kp] *
                 l0b_w_hh[reduce_kb, nb, np, reduce_kp]).astype("float32"),
                axis=[reduce_kb, reduce_kp]),
            name='l0c_whh_ht',
            attrs={'input_order': 'positive'})
    self.tensor_list1["l0c_whh_ht"] = l0c_whh_ht
    self.scope_list["l0c_whh_ht"] = cce.scope_cc

    # Move whh_ht to UB
    ub_whh_ht = tvm.compute(matmul_res_shape, lambda *i: l0c_whh_ht(*i),
                            name='ub_whh_ht')
    self.tensor_list1["ub_whh_ht"] = ub_whh_ht
    self.emit_cmd["ub_whh_ht"] = "dma_copy"
    self.scope_list["ub_whh_ht"] = cce.scope_ubuf

    # Move cont to UB
    ub_cont = tvm.compute(self.datas["cont"].shape,
                          lambda *i: self.datas["cont"](*i),
                          name='ub_cont')
    self.tensor_list1["ub_cont"] = ub_cont
    self.emit_cmd["ub_cont"] = "dma_copy"
    self.scope_list["ub_cont"] = cce.scope_ubuf

    if ub_cont.dtype == "float16" and self.device != "hisi_es":
        ub_cont_fp32 = tvm.compute(
            ub_cont.shape,
            lambda *i: topi.cast(ub_cont(*i), "float32"),
            name="ub_cont_fp32")
        self.tensor_list1["ub_cont_fp32"] = ub_cont_fp32
        self.emit_cmd["ub_cont_fp32"] = "vector_conv"
        self.scope_list["ub_cont_fp32"] = cce.scope_ubuf
    else:
        ub_cont_fp32 = ub_cont

    ub_whh_ht_cont = tvm.compute(
        matmul_res_shape,
        lambda i0, i1, i2, i3:
        ub_whh_ht[i0, i1, i2, i3] * ub_cont_fp32[i1, i2],
        name='ub_whh_ht_cont')
    self.tensor_list1["ub_whh_ht_cont"] = ub_whh_ht_cont
    self.emit_cmd["ub_whh_ht_cont"] = "vector_mul"
    self.scope_list["ub_whh_ht_cont"] = cce.scope_ubuf

    # Matmul accumulation wht_xt_bias_h + whh_ht_cont
    ub_ht_tmp1 = tvm.compute(
        matmul_res_shape,
        lambda *i: wht_xt_bias_h(*i) + ub_whh_ht_cont(*i),
        name="ub_ht_tmp1")
    self.tensor_list1["ub_ht_tmp1"] = ub_ht_tmp1
    self.emit_cmd["ub_ht_tmp1"] = "vector_add"
    self.scope_list["ub_ht_tmp1"] = cce.scope_ubuf
    return ub_ht_tmp1
def _dynamic_gru_inner(input_list, custom_list):
    input_x = input_list[0]
    weight1 = input_list[1]
    weight2 = input_list[2]
    bias1 = input_list[3]
    bias2 = input_list[4]
    s_init_h_gm = input_list[5]
    s_state_h_gm_last = input_list[6]

    is_gate_output = custom_list[0]
    is_first_round = custom_list[1]
    is_global_init = custom_list[2]

    input_dtype = 'float16'
    bias_dtype = bias1.dtype
    fp16_input_output = bias_dtype == 'float16'

    shape_x_input = input_x.shape
    shape_w1_input = weight1.shape
    w1_size = 2
    w2_size = 1
    t_size = shape_x_input[0].value
    m_size = shape_x_input[2].value
    k_size = shape_w1_input[1].value
    hidden_size = shape_w1_input[3].value
    in_x = k_size - hidden_size

    shape_b_1 = (1, k_size, w1_size, hidden_size, 16, 16)
    shape_b_2 = (1, k_size, w2_size, hidden_size, 16, 16)
    shape_c_1 = (1, w1_size, hidden_size, m_size, 16, 16)
    shape_c_2 = (1, w2_size, hidden_size, m_size, 16, 16)
    shape_bias_1 = (1, w1_size, hidden_size, 1, 1, 16)
    shape_bias_2 = (1, hidden_size, 1, 1, 16)
    shape_i = (1, hidden_size, m_size, 16, 16)
    shape_i_t = (t_size, hidden_size, m_size, 16, 16)
    k0_size = 16

    if is_first_round and not is_global_init:
        s_state_h = tvm.compute(
            shape_i,
            lambda *indices: tvm.const(0.0, dtype='float32'),
            name='s_state_h')
        s_state_h_fp16 = tvm.compute(
            shape_i,
            lambda *indices: s_state_h(*indices).astype('float16'),
            name="s_state_h_fp16")
    else:
        last_h = s_init_h_gm if is_first_round else s_state_h_gm_last
        if fp16_input_output:
            s_state_h_fp16 = tvm.compute(shape_i,
                                         lambda *indices: last_h(*indices),
                                         name='s_state_h_fp16')
            s_state_h = tvm.compute(
                shape_i,
                lambda *indices: s_state_h_fp16(*indices).astype('float32'),
                name="s_state_h")
        else:
            s_state_h = tvm.compute(shape_i,
                                    lambda *indices: last_h(*indices),
                                    name='s_state_h')
            s_state_h_fp16 = tvm.compute(
                shape_i,
                lambda *indices: s_state_h(*indices).astype('float16'),
                name="s_state_h_fp16")

    # compute
    # input and s_state_h need to go to ub first and be cast to float16
    shape_a_z_bigz = (1, m_size, k_size, 16, 16)

    # input and s_state_h are Nz and need a transform to zZ,
    # so swap axes 1 and 2
    a_l1_1 = tvm.compute(
        shape_a_z_bigz,
        lambda *indice: tvm.select(
            indice[2] < in_x,
            input_x[indice[0], indice[2], indice[1], indice[3], indice[4]],
            s_state_h_fp16[0, indice[2] - in_x, indice[1], indice[3],
                           indice[4]]),
        name="a_l1_1", tag="concat")
    b_l1_1 = tvm.compute(shape_b_1, lambda *indices: weight1(*indices),
                         name='b_l1_1')
    a_l0a_1 = tvm.compute(shape_a_z_bigz,
                          lambda *indices: a_l1_1(*indices), name="a_l0a_1")
    b_l0b_1 = tvm.compute(shape_b_1, lambda *indices: b_l1_1(*indices),
                          name="b_l0b_1")

    k1_1 = tvm.reduce_axis((0, k_size), name='k1_1')
    k0_1 = tvm.reduce_axis((0, k0_size), name='k0_1')
    c_l0c_1 = tvm.compute(
        shape_c_1,
        lambda t, nb_0, nb_1, mb, mp, np:
        tvm.sum((a_l0a_1[t, mb, k1_1, mp, k0_1] *
                 b_l0b_1[t, k1_1, nb_0, nb_1, np, k0_1]).astype('float32'),
                axis=[k1_1, k0_1]),
        name='c_l0c_1')
    c_ub_1 = tvm.compute(shape_c_1, lambda *indices: c_l0c_1(*indices),
                         name="c_ub_1")

    bias_ub_1 = tvm.compute(shape_bias_1,
                            lambda *indices: bias1(*indices),
                            name='bias_ub_1')
    bias_ub_1_fp32 = bias_ub_1
    if fp16_input_output:
        bias_ub_1_fp32 = tvm.compute(
            shape_bias_1,
            lambda *indices: bias_ub_1(*indices).astype('float32'),
            name="bias_ub_1_fp32")
    bias_bc_ub_1 = tbe.broadcast(bias_ub_1_fp32, shape_c_1)
    c_ub_bias_1 = tbe.vadd(c_ub_1, bias_bc_ub_1)

    # split matmul res
    r_t_index = 0
    i_t_index = 1
    r_t = tvm.compute(
        shape_i,
        lambda t, i, j, k, l: c_ub_bias_1(t, r_t_index, i, j, k, l),
        name="r_t")
    i_t = tvm.compute(
        shape_i,
        lambda t, i, j, k, l: c_ub_bias_1(t, i_t_index, i, j, k, l),
        name="i_t")
    r_t_sigmoid = _sigmoid_compute(r_t)
    i_t_sigmoid = _sigmoid_compute(i_t)

    r_t_mid = r_t_sigmoid
    i_t_mid = i_t_sigmoid
    if is_gate_output:
        if fp16_input_output:
            r_t_sigmoid_fp16 = tvm.compute(
                shape_i,
                lambda *indices: r_t_sigmoid(*indices).astype('float16'),
                name="r_t_sigmoid_fp16")
            i_t_sigmoid_fp16 = tvm.compute(
                shape_i,
                lambda *indices: i_t_sigmoid(*indices).astype('float16'),
                name="i_t_sigmoid_fp16")
            r_t_gm = tvm.compute(
                shape_i,
                lambda *indices: r_t_sigmoid_fp16(*indices),
                name="r_t_gm")
            i_t_gm = tvm.compute(
                shape_i,
                lambda *indices: i_t_sigmoid_fp16(*indices),
                name="i_t_gm")
            r_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: r_t_gm(*indices),
                                      name="r_t_gm_back")
            i_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: i_t_gm(*indices),
                                      name="i_t_gm_back")
            r_t_gm_back_fp32 = tvm.compute(
                shape_i,
                lambda *indices: r_t_gm_back(*indices).astype('float32'),
                name="r_t_gm_back_fp32")
            i_t_gm_back_fp32 = tvm.compute(
                shape_i,
                lambda *indices: i_t_gm_back(*indices).astype('float32'),
                name="i_t_gm_back_fp32")
            r_t_mid = r_t_gm_back_fp32
            i_t_mid = i_t_gm_back_fp32
        else:
            r_t_gm = tvm.compute(shape_i,
                                 lambda *indices: r_t_sigmoid(*indices),
                                 name="r_t_gm")
            i_t_gm = tvm.compute(shape_i,
                                 lambda *indices: i_t_sigmoid(*indices),
                                 name="i_t_gm")
            r_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: r_t_gm(*indices),
                                      name="r_t_gm_back")
            i_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: i_t_gm(*indices),
                                      name="i_t_gm_back")
            r_t_mid = r_t_gm_back
            i_t_mid = i_t_gm_back

    r_t_h = tbe.vmul(r_t_mid, s_state_h)
    r_t_h_fp16 = tvm.compute(
        shape_i,
        lambda *indices: r_t_h(*indices).astype(input_dtype),
        name="r_t_h_fp16")

    # second matmul
    a_l1_2 = tvm.compute(
        shape_a_z_bigz,
        lambda *indice: tvm.select(
            indice[2] < in_x,
            input_x[indice[0], indice[2], indice[1], indice[3], indice[4]],
            r_t_h_fp16[0, indice[2] - in_x, indice[1], indice[3],
                       indice[4]]),
        name="a_l1_2", tag="concat")
    b_l1_2 = tvm.compute(shape_b_2, lambda *indices: weight2(*indices),
                         name='b_l1_2')
    a_l0a_2 = tvm.compute(shape_a_z_bigz,
                          lambda *indices: a_l1_2(*indices), name="a_l0a_2")
    b_l0b_2 = tvm.compute(shape_b_2, lambda *indices: b_l1_2(*indices),
                          name="b_l0b_2")

    k1_2 = tvm.reduce_axis((0, k_size), name='k1_2')
    k0_2 = tvm.reduce_axis((0, k0_size), name='k0_2')
    c_l0c_2 = tvm.compute(
        shape_c_2,
        lambda t, nb_0, nb_1, mb, mp, np:
        tvm.sum((a_l0a_2[t, mb, k1_2, mp, k0_2] *
                 b_l0b_2[t, k1_2, nb_0, nb_1, np, k0_2]).astype('float32'),
                axis=[k1_2, k0_2]),
        name='c_l0c_2')
    c_ub_2 = tvm.compute(shape_i,
                         lambda t, h, m, i, j: c_l0c_2(t, 0, h, m, i, j),
                         name="c_ub_2")

    bias_ub_2 = tvm.compute(shape_bias_2,
                            lambda t, h, m, i, j: bias2(t, h, m, i, j),
                            name='bias_ub_2')
    bias_ub_2_fp32 = bias_ub_2
    if fp16_input_output:
        bias_ub_2_fp32 = tvm.compute(
            shape_bias_2,
            lambda *indices: bias_ub_2(*indices).astype('float32'),
            name="bias_ub_2_fp32")
    bias_bc_ub_2 = tbe.broadcast(bias_ub_2_fp32, shape_i)
    c_ub_bias_2 = tbe.vadd(c_ub_2, bias_bc_ub_2)

    h_t_tanh = _tanh_compute(c_ub_bias_2)

    h_t_tanh_mid = h_t_tanh
    if is_gate_output:
        if fp16_input_output:
            h_t_tanh_fp16 = tvm.compute(
                shape_i,
                lambda *indices: h_t_tanh(*indices).astype('float16'),
                name="h_t_tanh_fp16")
            n_t_gm = tvm.compute(
                shape_i,
                lambda *indices: h_t_tanh_fp16(*indices),
                name="n_t_gm")
            n_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: n_t_gm(*indices),
                                      name="n_t_gm_back")
            n_t_gm_back_fp32 = tvm.compute(
                shape_i,
                lambda *indices: n_t_gm_back(*indices).astype('float32'),
                name="n_t_gm_back_fp32")
            h_t_tanh_mid = n_t_gm_back_fp32
        else:
            n_t_gm = tvm.compute(shape_i,
                                 lambda *indices: h_t_tanh(*indices),
                                 name="n_t_gm")
            n_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: n_t_gm(*indices),
                                      name="n_t_gm_back")
            h_t_tanh_mid = n_t_gm_back

    c_t_tmp1 = tbe.vsub(s_state_h, h_t_tanh_mid)
    c_t_tmp2 = tbe.vmul(c_t_tmp1, i_t_mid)
    update_h = tbe.vadd(c_t_tmp2, h_t_tanh_mid)
    update_h_ub = update_h
    if fp16_input_output:
        update_h_fp16 = tvm.compute(
            shape_i_t,
            lambda *indices: update_h(*indices).astype('float16'),
            name="update_h_fp16")
        update_h_ub = update_h_fp16

    update_y_gm = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_h_ub(0, i, j, k, l),
        name="update_y_gm")
    update_y_gm_back = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_y_gm(0, i, j, k, l),
        name="update_y_gm_back")
    update_h_gm = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_y_gm_back(0, i, j, k, l),
        name="update_h_gm")
    # end compute

    # schedule
    s = tvm.schedule.create_schedule([update_h_gm.op])

    def gen_reversed_subgraph_list(out_tensor, tensor_list):
        """traverse tensors by Depth-First-Search"""
        if out_tensor is None:
            return
        stack = [out_tensor]
        visited_list = []
        while stack:
            cur_tensor = stack.pop()
            visited_list.append(cur_tensor)
            for in_tensor in cur_tensor.op.input_tensors:
                if in_tensor not in visited_list:
                    stack.append(in_tensor)
                    if "elewise" in in_tensor.op.tag or \
                            "broadcast" == in_tensor.op.tag:
                        if in_tensor not in tensor_list:
                            tensor_list.append(in_tensor)

    elewise_tensors_r_t_h_fp16 = []
    gen_reversed_subgraph_list(r_t_h_fp16, elewise_tensors_r_t_h_fp16)

    elewise_tensors = []
    tmp_tensors = []
    gen_reversed_subgraph_list(update_h_gm, tmp_tensors)
    for i in tmp_tensors:
        if i not in elewise_tensors_r_t_h_fp16:
            elewise_tensors.append(i)

    # set scope
    s[s_state_h].set_scope(tbe_platform.scope_ubuf)
    s[s_state_h_fp16].set_scope(tbe_platform.scope_ubuf)
    s[a_l1_1].set_scope(tbe_platform.scope_cbuf)
    s[b_l1_1].set_scope(tbe_platform.scope_cbuf)
    s[a_l0a_1].set_scope(tbe_platform.scope_ca)
    s[b_l0b_1].set_scope(tbe_platform.scope_cb)
    s[c_l0c_1].set_scope(tbe_platform.scope_cc)
    s[c_ub_1].set_scope(tbe_platform.scope_ubuf)
    s[bias_ub_1].set_scope(tbe_platform.scope_ubuf)
    s[bias_bc_ub_1].set_scope(tbe_platform.scope_ubuf)
    s[r_t_h_fp16].set_scope(tbe_platform.scope_ubuf)
    s[a_l1_2].set_scope(tbe_platform.scope_cbuf)
    s[b_l1_2].set_scope(tbe_platform.scope_cbuf)
    s[a_l0a_2].set_scope(tbe_platform.scope_ca)
    s[b_l0b_2].set_scope(tbe_platform.scope_cb)
    s[c_l0c_2].set_scope(tbe_platform.scope_cc)
    s[c_ub_2].set_scope(tbe_platform.scope_ubuf)
    s[bias_ub_2].set_scope(tbe_platform.scope_ubuf)
    s[bias_bc_ub_2].set_scope(tbe_platform.scope_ubuf)
    s[update_y_gm_back].set_scope(tbe_platform.scope_ubuf)
    if is_gate_output:
        s[r_t_gm_back].set_scope(tbe_platform.scope_ubuf)
        s[i_t_gm_back].set_scope(tbe_platform.scope_ubuf)
        s[n_t_gm_back].set_scope(tbe_platform.scope_ubuf)
        if fp16_input_output:
            s[r_t_sigmoid_fp16].set_scope(tbe_platform.scope_ubuf)
            s[i_t_sigmoid_fp16].set_scope(tbe_platform.scope_ubuf)
            s[h_t_tanh_fp16].set_scope(tbe_platform.scope_ubuf)
            s[r_t_gm_back_fp32].set_scope(tbe_platform.scope_ubuf)
            s[i_t_gm_back_fp32].set_scope(tbe_platform.scope_ubuf)
            s[n_t_gm_back_fp32].set_scope(tbe_platform.scope_ubuf)
    if fp16_input_output:
        s[bias_ub_1_fp32].set_scope(tbe_platform.scope_ubuf)
        s[bias_ub_2_fp32].set_scope(tbe_platform.scope_ubuf)
        s[update_h_fp16].set_scope(tbe_platform.scope_ubuf)

    # compute inline
    compute_inline_tensors = [i_t, r_t]
    for tensor in compute_inline_tensors:
        s[tensor].compute_inline()

    # matmul tiling
    factor_l1_m, factor_l1_n, factor_l1_k, \
        factor_l0_m, factor_l0_n, factor_l0_k = \
        _get_tiling(m_size, k_size, hidden_size)

    l1_n_outer_1, l1_n_inner_1 = s[c_l0c_1].split(c_l0c_1.op.axis[2],
                                                  factor=factor_l1_n)
    l1_m_outer_1, l1_m_inner_1 = s[c_l0c_1].split(c_l0c_1.op.axis[3],
                                                  factor=factor_l1_m)
    l1_k_outer_1, l1_k_inner_1 = s[c_l0c_1].split(
        c_l0c_1.op.reduce_axis[0], factor=factor_l1_k)
    l0_n_outer_1, l0_n_inner_1 = s[c_l0c_1].split(l1_n_inner_1,
                                                  factor=factor_l0_n)
    l0_m_outer_1, l0_m_inner_1 = s[c_l0c_1].split(l1_m_inner_1,
                                                  factor=factor_l0_m)
    l0_k_outer_1, l0_k_inner_1 = s[c_l0c_1].split(l1_k_inner_1,
                                                  factor=factor_l0_k)
    s[c_l0c_1].reorder(c_l0c_1.op.axis[0], l1_n_outer_1, l1_k_outer_1,
                       c_l0c_1.op.axis[1], l1_m_outer_1, l0_n_outer_1,
                       l0_m_outer_1, l0_k_outer_1, l0_n_inner_1,
                       l0_m_inner_1, c_l0c_1.op.axis[4],
                       c_l0c_1.op.axis[5], l0_k_inner_1,
                       c_l0c_1.op.reduce_axis[1])

    s[a_l1_1].double_buffer()
    s[b_l1_1].double_buffer()
    s[a_l0a_1].double_buffer()
    s[b_l0b_1].double_buffer()
    s[c_l0c_1].double_buffer()
    s[c_ub_1].double_buffer()

    s[a_l1_1].compute_at(s[c_l0c_1], l1_k_outer_1)
    s[b_l1_1].compute_at(s[c_l0c_1], c_l0c_1.op.axis[1])
    s[a_l0a_1].compute_at(s[c_l0c_1], l1_k_outer_1)
    s[b_l0b_1].compute_at(s[c_l0c_1], l0_k_outer_1)

    c_ub_bias_1_outer, c_ub_bias_1_inner = s[c_ub_bias_1].split(
        c_ub_bias_1.op.axis[2], factor=factor_l1_n)
    s[c_ub_bias_1].reorder(c_ub_bias_1.op.axis[0], c_ub_bias_1_outer,
                           c_ub_bias_1.op.axis[1], c_ub_bias_1_inner,
                           c_ub_bias_1.op.axis[3], c_ub_bias_1.op.axis[4],
                           c_ub_bias_1.op.axis[5])
    s[c_l0c_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[c_ub_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[bias_ub_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[bias_bc_ub_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    if fp16_input_output:
        s[bias_ub_1_fp32].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[c_ub_bias_1].emit_insn(c_ub_bias_1.op.axis[1], 'vector_add')

    r_t_h_fp16_outer, r_t_h_fp16_inner = s[r_t_h_fp16].split(
        r_t_h_fp16.op.axis[1], factor=factor_l1_n)
    for tensor in elewise_tensors_r_t_h_fp16:
        s[tensor].set_scope(tbe_platform.scope_ubuf)
        if tensor == c_ub_bias_1:
            continue
        s[tensor].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
        insn = _get_emit_insn_map(tensor)
        s[tensor].emit_insn(tensor.op.axis[0], insn)
    if is_gate_output:
        s[r_t_gm].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
        s[r_t_gm_back].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
        if fp16_input_output:
            s[r_t_sigmoid_fp16].compute_at(s[r_t_h_fp16],
                                           r_t_h_fp16_outer)
            s[r_t_gm_back_fp32].compute_at(s[r_t_h_fp16],
                                           r_t_h_fp16_outer)
    s[r_t_h_fp16].emit_insn(r_t_h_fp16_inner, 'vector_conv')

    l1_n_outer_2, l1_n_inner_2 = s[c_l0c_2].split(c_l0c_2.op.axis[2],
                                                  factor=factor_l1_n)
    l1_m_outer_2, l1_m_inner_2 = s[c_l0c_2].split(c_l0c_2.op.axis[3],
                                                  factor=factor_l1_m)
    l1_k_outer_2, l1_k_inner_2 = s[c_l0c_2].split(
        c_l0c_2.op.reduce_axis[0], factor=factor_l1_k)
    l0_n_outer_2, l0_n_inner_2 = s[c_l0c_2].split(l1_n_inner_2,
                                                  factor=factor_l0_n)
    l0_m_outer_2, l0_m_inner_2 = s[c_l0c_2].split(l1_m_inner_2,
                                                  factor=factor_l0_m)
    l0_k_outer_2, l0_k_inner_2 = s[c_l0c_2].split(l1_k_inner_2,
                                                  factor=factor_l0_k)
    s[c_l0c_2].reorder(c_l0c_2.op.axis[0], l1_n_outer_2, l1_k_outer_2,
                       c_l0c_2.op.axis[1], l1_m_outer_2, l0_n_outer_2,
                       l0_m_outer_2, l0_k_outer_2, l0_n_inner_2,
                       l0_m_inner_2, c_l0c_2.op.axis[4],
                       c_l0c_2.op.axis[5], l0_k_inner_2,
                       c_l0c_2.op.reduce_axis[1])

    s[a_l1_2].double_buffer()
    s[b_l1_2].double_buffer()
    s[a_l0a_2].double_buffer()
    s[b_l0b_2].double_buffer()
    s[c_l0c_2].double_buffer()
    s[c_ub_2].double_buffer()

    s[a_l1_2].compute_at(s[c_l0c_2], l1_k_outer_2)
    s[b_l1_2].compute_at(s[c_l0c_2], c_l0c_2.op.axis[1])
    s[a_l0a_2].compute_at(s[c_l0c_2], l1_k_outer_2)
    s[b_l0b_2].compute_at(s[c_l0c_2], l0_k_outer_2)

    update_h_gm_outer, update_h_gm_inner = s[update_h_gm].split(
        update_h_gm.op.axis[1], factor=factor_l1_n)
    s[c_l0c_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[c_ub_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[bias_ub_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[bias_bc_ub_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[c_ub_bias_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[update_y_gm].compute_at(s[update_h_gm], update_h_gm_outer)
    s[update_y_gm_back].compute_at(s[update_h_gm], update_h_gm_outer)
    if fp16_input_output:
        s[bias_ub_2_fp32].compute_at(s[update_h_gm], update_h_gm_outer)
        s[update_h_fp16].compute_at(s[update_h_gm], update_h_gm_outer)
    if is_gate_output:
        s[i_t_gm].compute_at(s[update_h_gm], update_h_gm_outer)
        s[i_t_gm_back].compute_at(s[update_h_gm], update_h_gm_outer)
        s[n_t_gm].compute_at(s[update_h_gm], update_h_gm_outer)
        s[n_t_gm_back].compute_at(s[update_h_gm], update_h_gm_outer)
        if fp16_input_output:
            s[i_t_sigmoid_fp16].compute_at(s[update_h_gm],
                                           update_h_gm_outer)
            s[i_t_gm_back_fp32].compute_at(s[update_h_gm],
                                           update_h_gm_outer)
            s[h_t_tanh_fp16].compute_at(s[update_h_gm],
                                        update_h_gm_outer)
            s[n_t_gm_back_fp32].compute_at(s[update_h_gm],
                                           update_h_gm_outer)

    for tensor in elewise_tensors:
        s[tensor].set_scope(tbe_platform.scope_ubuf)
        s[tensor].compute_at(s[update_h_gm], update_h_gm_outer)
        insn = _get_emit_insn_map(tensor)
        s[tensor].emit_insn(tensor.op.axis[0], insn)

    # emit insn
    if is_first_round and not is_global_init:
        s[s_state_h].emit_insn(s_state_h.op.axis[0], 'broadcast')
        s[s_state_h_fp16].emit_insn(s_state_h_fp16.op.axis[0],
                                    'vector_conv')
    else:
        if fp16_input_output:
            s[s_state_h_fp16].emit_insn(s_state_h_fp16.op.axis[0],
                                        'dma_copy')
            s[s_state_h].emit_insn(s_state_h.op.axis[0], 'vector_conv')
        else:
            s[s_state_h].emit_insn(s_state_h.op.axis[0], 'dma_copy')
            s[s_state_h_fp16].emit_insn(s_state_h_fp16.op.axis[0],
                                        'vector_conv')

    s[a_l1_1].emit_insn(a_l1_1.op.axis[0], 'dma_copy')
    s[b_l1_1].emit_insn(b_l1_1.op.axis[0], 'dma_copy')
    s[a_l0a_1].emit_insn(a_l0a_1.op.axis[0], 'dma_copy')
    s[b_l0b_1].emit_insn(b_l0b_1.op.axis[0], 'dma_copy')
    mad_dict = {"mad_pattern": 0, "k_outer": [l1_k_outer_1, l0_k_outer_1]}
    s[c_l0c_1].emit_insn(l0_n_inner_1, 'mad', mad_dict)
    s[c_ub_1].emit_insn(c_ub_1.op.axis[0], 'dma_copy')
    s[bias_ub_1].emit_insn(bias_ub_1.op.axis[0], 'dma_copy')
    if fp16_input_output:
        s[bias_ub_1_fp32].emit_insn(bias_ub_1_fp32.op.axis[0],
                                    'vector_conv')
        s[bias_ub_2_fp32].emit_insn(bias_ub_2_fp32.op.axis[0],
                                    'vector_conv')
        s[update_h_fp16].emit_insn(update_h_fp16.op.axis[0],
                                   'vector_conv')
    s[bias_bc_ub_1].emit_insn(bias_bc_ub_1.op.axis[0],
                              'unified_broadcast')

    s[a_l1_2].emit_insn(a_l1_2.op.axis[0], 'dma_copy')
    s[b_l1_2].emit_insn(b_l1_2.op.axis[0], 'dma_copy')
    s[a_l0a_2].emit_insn(a_l0a_2.op.axis[0], 'dma_copy')
    s[b_l0b_2].emit_insn(b_l0b_2.op.axis[0], 'dma_copy')
    mad_dict = {"mad_pattern": 0, "k_outer": [l1_k_outer_2, l0_k_outer_2]}
    s[c_l0c_2].emit_insn(l0_n_inner_2, 'mad', mad_dict)
    s[c_ub_2].emit_insn(c_ub_2.op.axis[0], 'dma_copy')
    s[bias_ub_2].emit_insn(bias_ub_2.op.axis[0], 'dma_copy')
    s[bias_bc_ub_2].emit_insn(bias_bc_ub_2.op.axis[0],
                              'unified_broadcast')

    s[update_y_gm].emit_insn(update_y_gm.op.axis[0], 'dma_copy')
    s[update_y_gm_back].emit_insn(update_y_gm_back.op.axis[0],
                                  'phony_insn')
    s[update_y_gm_back].reused_by(update_h_ub)
    if is_gate_output:
        s[r_t_gm].emit_insn(r_t_gm.op.axis[0], 'dma_copy')
        s[i_t_gm].emit_insn(i_t_gm.op.axis[0], 'dma_copy')
        s[n_t_gm].emit_insn(n_t_gm.op.axis[0], 'dma_copy')
        s[r_t_gm_back].emit_insn(r_t_gm_back.op.axis[0], 'phony_insn')
        s[i_t_gm_back].emit_insn(i_t_gm_back.op.axis[0], 'phony_insn')
        s[n_t_gm_back].emit_insn(n_t_gm_back.op.axis[0], 'phony_insn')
        if fp16_input_output:
            s[r_t_sigmoid_fp16].emit_insn(r_t_sigmoid_fp16.op.axis[0],
                                          'vector_conv')
            s[i_t_sigmoid_fp16].emit_insn(i_t_sigmoid_fp16.op.axis[0],
                                          'vector_conv')
            s[h_t_tanh_fp16].emit_insn(h_t_tanh_fp16.op.axis[0],
                                       'vector_conv')
            s[r_t_gm_back_fp32].emit_insn(r_t_gm_back_fp32.op.axis[0],
                                          'phony_insn')
            s[i_t_gm_back_fp32].emit_insn(i_t_gm_back_fp32.op.axis[0],
                                          'phony_insn')
            s[n_t_gm_back_fp32].emit_insn(n_t_gm_back_fp32.op.axis[0],
                                          'phony_insn')
            s[r_t_gm_back_fp32].reused_by(r_t_sigmoid)
            s[i_t_gm_back_fp32].reused_by(i_t_sigmoid)
            s[n_t_gm_back_fp32].reused_by(h_t_tanh)
            s[r_t_gm_back].reused_by(r_t_sigmoid_fp16)
            s[i_t_gm_back].reused_by(i_t_sigmoid_fp16)
            s[n_t_gm_back].reused_by(h_t_tanh_fp16)
        else:
            s[r_t_gm_back].reused_by(r_t_sigmoid)
            s[i_t_gm_back].reused_by(i_t_sigmoid)
            s[n_t_gm_back].reused_by(h_t_tanh)
    s[update_h_gm].emit_insn(update_h_gm_inner, 'dma_copy')

    output_list = [update_y_gm, update_h_gm]
    if is_gate_output:
        output_list.append(r_t_gm)
        output_list.append(i_t_gm)
        output_list.append(n_t_gm)
    return output_list, s
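# NumPy reference for the final GRU blend in _dynamic_gru_inner:
# update_h = i*(h - n) + n, i.e. the usual h' = i*h + (1 - i)*n with i the
# update gate and n the tanh candidate state. Illustrative only.
#
# def _gru_blend_reference(i_gate, n_candidate, h_prev):
#     return i_gate * (h_prev - n_candidate) + n_candidate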
def softmax_cross_entropy_with_logits_compute_ex(input_features,
                                                 input_labels):
    """
    Computes softmax cross entropy cost.
    softmax = e^(x-max) / ∑(e^(x-max))
    log(softmax) = (x-max) - log(∑e^(x-max))
    cross_entropy = -∑(y * log(softmax))

    Parameters
    ----------
    input_features: TVM tensor
        input tensor with shape and dtype attributes.
        source data type supports "float16" and "float32".
    input_labels: TVM tensor
        input tensor with shape and dtype attributes.
        Must have the same type as `input_features`.

    Returns
    -------
    res: list of two TVM tensors, [loss, backprop].
        Both have the same type as `input_features`; loss is the per-row
        cross entropy, backprop is the gradient softmax - labels.
    """
    shape_features = te.lang.cce.util.shape_to_list(input_features.shape)
    shape_labels = te.lang.cce.util.shape_to_list(input_labels.shape)
    dtype = input_features.dtype.lower()

    if list(shape_features) != list(shape_labels):
        shape_features, shape_labels, shape_broadcast = \
            broadcast_shapes(shape_features, shape_labels,
                             param_name_input1="input_features",
                             param_name_input2="input_labels")
        input_features = te.lang.cce.broadcast(input_features,
                                               shape_broadcast, dtype)
        input_labels = te.lang.cce.broadcast(input_labels,
                                             shape_broadcast, dtype)
    else:
        shape_broadcast = shape_features

    if dtype == "float16":
        input_features = te.lang.cce.cast_to(input_features, "float32")
        input_labels = te.lang.cce.cast_to(input_labels, "float32")

    with tvm.tag_scope("last_axis_reduce_max"):
        reduce_axis = tvm.reduce_axis((0, shape_broadcast[1]), name="rax0")
        data_max = tvm.compute(
            (shape_broadcast[0], 1),
            lambda upper, lower: tvm.max(input_features[upper, reduce_axis],
                                         axis=reduce_axis),
            name="last_axis_reduce_max")
    with tvm.tag_scope("elewise_binary_sub_scalar_L1"):
        data_sub = tvm.compute(
            input_features.shape,
            lambda higher, lower:
                input_features[higher][lower] - data_max[higher][0],
            name="manual_sub_0")
    data_exp = te.lang.cce.vexp(data_sub)
    data_sum = te.lang.cce.sum(data_exp, axis=-1, keepdims=True)
    with tvm.tag_scope("elewise_binary_div"):
        data_div = tvm.compute(
            data_exp.shape,
            lambda higher, lower:
                data_exp[higher][lower] / data_sum[higher][0],
            name="manual_div_0")
    data_log_tmp = te.lang.cce.vlog(data_sum)
    with tvm.tag_scope("elewise_get_L1_workspace"):
        fake_buffer = tvm.compute(
            data_sub.shape,
            lambda higher, lower: tvm.const(0, "float32"),
            name="get_L1_workspace")
    with tvm.tag_scope("elewise_binary_sub"):
        data_log = tvm.compute(
            data_sub.shape,
            lambda higher, lower:
                fake_buffer[higher][lower] - data_log_tmp[higher][0],
            name="manual_sub_1")
    data_mul = te.lang.cce.vmul(input_labels, data_log)
    with tvm.tag_scope("last_axis_reduce_sum_reuse"):
        reduce_axis = tvm.reduce_axis((0, shape_broadcast[1]), name="rax1")
        loss = tvm.compute(
            (shape_broadcast[0], 1),
            lambda upper, lower: tvm.sum(data_mul[upper, reduce_axis],
                                         axis=reduce_axis),
            name="last_axis_reduce_sum_reuse")
    loss = te.lang.cce.vmuls(loss, SCALAR_MINUS_ONE)
    backprop = te.lang.cce.vsub(data_div, input_labels)

    if dtype == "float16":
        loss = te.lang.cce.cast_to(loss, "float16")
        backprop = te.lang.cce.cast_to(backprop, "float16")
    res = [loss, backprop]

    return res
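# ---------------------------------------------------------------------------
# Reference sketch, not part of the kernel (assumption: NumPy is available in
# the test environment, and `_softmax_cross_entropy_reference` is a
# hypothetical helper name). It restates the three formulas from the
# docstring above in plain NumPy so the TVM pipeline can be spot-checked on
# small inputs.
def _softmax_cross_entropy_reference(features, labels):
    import numpy as np
    # softmax = e^(x - max) / sum(e^(x - max)), row-wise max for stability
    shifted = features - features.max(axis=-1, keepdims=True)
    exp = np.exp(shifted)
    softmax = exp / exp.sum(axis=-1, keepdims=True)
    # log(softmax) = (x - max) - log(sum(e^(x - max)))
    log_softmax = shifted - np.log(exp.sum(axis=-1, keepdims=True))
    # cross_entropy = -sum(y * log(softmax)); its gradient is softmax - y
    loss = -(labels * log_softmax).sum(axis=-1, keepdims=True)
    backprop = softmax - labels
    return loss, backprop
# ---------------------------------------------------------------------------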
def get_matmul_tensor(x, h, c, w, b, build_list, tensor_list, scope_list,
                      operation_list, is_hisi_es):
    shape_x = x.get("shape")
    shape_h = h.get("shape")
    shape_c = c.get("shape")
    dtype_x = x.get("dtype").lower()
    dtype_c = c.get("dtype").lower()
    dtype_b = b.get("dtype").lower()

    input_dim, batch_dim = shape_x[0:2]
    hidden_dim = shape_h[0]
    output_dim = hidden_dim
    shape_b = b.get("shape")
    shape_b = (shape_b[0] // 16, 16)
    shape_xh = (batch_dim, input_dim + hidden_dim, C0, C0)
    shape_w = w.get("shape")
    shape_w_split = list(shape_w)
    shape_w_split[1] = shape_w_split[1] // 4

    # Inputs in gm
    tensor_x = tvm.placeholder(shape_x, name='tensor_x', dtype=dtype_x)
    tensor_h = tvm.placeholder(shape_h, name='tensor_h', dtype=dtype_x)
    tensor_c = tvm.placeholder(shape_c, name='tensor_c', dtype=dtype_c)
    tensor_w = tvm.placeholder(shape_w, name='tensor_w', dtype=dtype_x)
    tensor_b = tvm.placeholder(shape_b, name='tensor_b', dtype=dtype_c)
    build_list["x"] = tensor_x
    build_list["h"] = tensor_h
    build_list["c"] = tensor_c
    build_list["w"] = tensor_w
    build_list["b"] = tensor_b

    symbol = ["it", "jt", "ft", "ot"]

    def _index_w(str_name, *index):
        if str_name == "it":
            return index[0], index[1], index[2], index[3]
        elif str_name == "jt":
            return index[0], index[1] + output_dim, index[2], index[3]
        elif str_name == "ft":
            return index[0], index[1] + output_dim * 2, index[2], index[3]
        return index[0], index[1] + output_dim * 3, index[2], index[3]

    def _index_bias(str_name):
        if str_name == "it":
            return 0
        elif str_name == "jt":
            return 1
        elif str_name == "ft":
            return 2
        return 3

    matmul_type = "float32"
    if is_hisi_es:
        matmul_type = "float16"
    for t in symbol:
        # concat x and h into one tensor and copy it to L1
        tensor_xh_l1_tmp = tvm.compute(
            shape_xh,
            lambda *indice: tvm.select(
                indice[1] < input_dim,
                tensor_x[indice[1], indice[0], indice[2], indice[3]],
                tensor_h[indice[1] - input_dim, indice[0], indice[2],
                         indice[3]]),
            name="tensor_xh_l1_" + t,
            tag="concat")
        tensor_list["tensor_xh_l1_" + t] = tensor_xh_l1_tmp
        scope_list["tensor_xh_l1_" + t] = cce.scope_cbuf
        # optimization: copy one time
        operation_list["tensor_xh_l1_" + t] = "dma_copy"

        # copy xh from L1 to L0A
        tensor_xh_l0a_tmp = tvm.compute(shape_xh,
                                        lambda *i: tensor_xh_l1_tmp(*i),
                                        name='tensor_xh_l0a_' + t)
        tensor_list["tensor_xh_l0a_" + t] = tensor_xh_l0a_tmp
        scope_list["tensor_xh_l0a_" + t] = cce.scope_ca
        # optimization: copy one time
        operation_list["tensor_xh_l0a_" + t] = "dma_copy"

        # copy w to L1 buf
        tensor_w_l1_tmp = tvm.compute(shape_w_split,
                                      lambda *i: tensor_w(*_index_w(t, *i)),
                                      name='tensor_w_l1_' + t)
        tensor_list["tensor_w_l1_" + t] = tensor_w_l1_tmp
        scope_list["tensor_w_l1_" + t] = cce.scope_cbuf
        operation_list["tensor_w_l1_" + t] = "dma_copy"

        # copy w from L1 to L0B
        tensor_w_l0b_tmp = tvm.compute(shape_w_split,
                                       lambda *i: tensor_w_l1_tmp(*i),
                                       name='tensor_w_l0b_' + t)
        tensor_list["tensor_w_l0b_" + t] = tensor_w_l0b_tmp
        scope_list["tensor_w_l0b_" + t] = cce.scope_cb
        operation_list["tensor_w_l0b_" + t] = "dma_copy"

        # copy bias to ubuf, split out this gate's slice
        tensor_b_ub_tmp = tvm.compute(
            shape_b,
            lambda i0, i1: tensor_b[_index_bias(t) * output_dim + i0, i1],
            name='tensor_b_ub_' + t)
        tensor_list["tensor_b_ub_" + t] = tensor_b_ub_tmp
        scope_list["tensor_b_ub_" + t] = cce.scope_ubuf
        operation_list["tensor_b_ub_" + t] = "dma_copy"

        tensor_b_ub_true_tmp = tensor_b_ub_tmp
        if not is_hisi_es and dtype_b == "float16":
            tensor_b_ub_true_tmp = tvm.compute(
                shape_b,
                lambda *i: topi.cast(tensor_b_ub_tmp(*i), "float32"),
                name="tensor_b_ub_true_" + t)
            tensor_list["tensor_b_ub_true_" + t] = tensor_b_ub_true_tmp
            scope_list["tensor_b_ub_true_" + t] = cce.scope_ubuf
            operation_list["tensor_b_ub_true_" + t] = "vector_conv"

        # broadcast bias from [output_dim//16, 16] to
        # [output_dim//16, N//16, 16, 16]
        tensor_b_loc_tmp = tvm.compute(
            shape_h,
            lambda i0, i1, i2, i3: tensor_b_ub_true_tmp[i0, i3],
            name='tensor_b_loc_' + t)
        tensor_list["tensor_b_loc_" + t] = tensor_b_loc_tmp
        scope_list["tensor_b_loc_" + t] = cce.scope_cc
        operation_list["tensor_b_loc_" + t] = "dma_copy"

        # do matmul
        reduce_kb = tvm.reduce_axis((0, input_dim + output_dim),
                                    name='reduce_kb_' + t)
        reduce_kp = tvm.reduce_axis((0, C0), name='reduce_kp_' + t)
        tensor_matmul_l0c_tmp = tvm.compute(
            shape_h,
            lambda nb, mb, mp, np: tvm.sum(
                (tensor_xh_l0a_tmp[mb, reduce_kb, mp, reduce_kp] *
                 tensor_w_l0b_tmp[reduce_kb, nb, np, reduce_kp]
                 ).astype(matmul_type),
                axis=[reduce_kb, reduce_kp]),
            name='tensor_matmul_l0c_' + t,
            attrs={'input_order': 'positive'})
        tensor_list["tensor_matmul_l0c_" + t] = tensor_matmul_l0c_tmp
        scope_list["tensor_matmul_l0c_" + t] = cce.scope_cc

        # matmul accumulation: gate result plus its bias, e.g. it + b_it
        tensor_matmul_result_l0c_tmp = tvm.compute(
            shape_h,
            lambda *i: tensor_b_loc_tmp(*i) + tensor_matmul_l0c_tmp(*i),
            name="tensor_matmul_result_l0c_" + t)
        tensor_list["tensor_matmul_result_l0c_" + t] = \
            tensor_matmul_result_l0c_tmp
        scope_list["tensor_matmul_result_l0c_" + t] = cce.scope_cc
        operation_list["tensor_matmul_result_l0c_" + t] = "phony_insn"

        # copy matmul result from L0C to UB
        gate_ub_tmp = tvm.compute(
            shape_h,
            lambda *i: tensor_list["tensor_matmul_result_l0c_" + t](*i),
            name=t + "_ub")
        tensor_list[t + "_ub"] = gate_ub_tmp
        scope_list[t + "_ub"] = cce.scope_ubuf
        operation_list[t + "_ub"] = "dma_copy"
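# ---------------------------------------------------------------------------
# Illustrative sketch, not used by the kernel: the fused LSTM weight keeps
# the four gates [it, jt, ft, ot] stacked along its second axis, and
# _index_w above addresses one quarter per gate by adding multiples of
# output_dim. The NumPy lines below replay that offset arithmetic on a toy
# 2-D array (the real tensor is 4-D fractal) so the slicing is easy to
# verify; `fused_w` and `gate_slices` are hypothetical names.
def _gate_slicing_sketch():
    import numpy as np
    output_dim = 2  # toy value; the real kernel uses hidden_dim
    # the second axis holds 4 * output_dim blocks: it | jt | ft | ot
    fused_w = np.arange(3 * 4 * output_dim).reshape(3, 4 * output_dim)
    gate_slices = {
        "it": fused_w[:, 0 * output_dim:1 * output_dim],
        "jt": fused_w[:, 1 * output_dim:2 * output_dim],
        "ft": fused_w[:, 2 * output_dim:3 * output_dim],
        "ot": fused_w[:, 3 * output_dim:4 * output_dim],
    }
    return gate_slices
# ---------------------------------------------------------------------------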
def avg_pool3d_compute(x, y, ksize, strides, pads, data_format="NDHWC",
                       kernel_name="avg_pool3d"):
    """
    avg_pool3d compute

    Parameters
    ----------
    x: input tensor dict
    y: output tensor dict
    ksize: kernel size
    strides: strides
    pads: padding sizes, list or tuple
    data_format: must be "NDHWC"
    kernel_name: kernel name

    Returns
    -------
    output tensor
    """
    shape = x.shape

    if len(ksize) == 5:
        a_size = ksize[1] * ksize[2] * ksize[3]
        ksize_d = ksize[1]
    elif len(ksize) == 3:
        a_size = ksize[0] * ksize[1] * ksize[2]
        ksize_d = ksize[0]
    else:
        a_size = ksize[0] * ksize[0] * ksize[0]
        ksize_d = ksize[0]

    if len(strides) == 5:
        stride_d = strides[1]
    else:
        stride_d = strides[0]

    # copy gm to ub
    tensor_in_ub = tvm.compute(shape, lambda *i: x[i], name="tensor_in_ub")

    tensor_in_ub_cast = tvm.compute(
        shape,
        lambda *i: tensor_in_ub(*i).astype("float32"),
        name="tensor_in_ub_cast")

    d_axis = tvm.reduce_axis((0, ksize_d), "d_sum")
    hw_axis = tvm.reduce_axis((0, shape[3]), "hw_sum")
    origin_d = shape[1]
    reduced_d = 1 + (origin_d - ksize_d) // stride_d
    shape_d_hw = (shape[0], reduced_d, shape[2], 1, shape[4])
    # sum over the sliding D window and the whole flattened H*W axis
    tensor_d_hw = tvm.compute(
        shape_d_hw,
        lambda n, d, c1, hw, c0: tvm.sum(
            tensor_in_ub_cast[n, d * stride_d + d_axis, c1, hw_axis, c0],
            axis=[d_axis, hw_axis]),
        name="tensor_d_hw")

    # scale by the reciprocal of the kernel volume to get the average
    tensor_a = tvm.compute(
        shape_d_hw,
        lambda n, d, c1, hw, c0:
            tensor_d_hw[n, d, c1, hw, c0] *
            tvm.const(1.0 / a_size, dtype="float32"),
        name="tensor_a")

    res_cast = tvm.compute(shape_d_hw,
                           lambda *i: tensor_a(*i).astype("float16"),
                           name="res_cast")
    res = tvm.compute(shape_d_hw, lambda *i: res_cast[i], name='res')
    return res
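# ---------------------------------------------------------------------------
# Reference sketch, not part of the kernel (assumption: NumPy is available
# and `_avg_pool3d_reference` is a hypothetical helper name). The compute
# above slides a window of ksize_d along D with stride_d, sums over that
# window and over the entire flattened H*W axis, then scales by 1/a_size.
# The NumPy version below mirrors that on an (N, D, C1, H*W, C0)-shaped
# array so small cases can be compared numerically.
def _avg_pool3d_reference(data, ksize_d, stride_d, a_size):
    import numpy as np
    n, d, c1, hw, c0 = data.shape
    reduced_d = 1 + (d - ksize_d) // stride_d
    out = np.empty((n, reduced_d, c1, 1, c0), dtype=np.float32)
    for od in range(reduced_d):
        window = data[:, od * stride_d:od * stride_d + ksize_d]
        # sum over the D window (axis 1) and the flattened H*W axis (axis 3)
        out[:, od, :, 0, :] = window.astype(np.float32).sum(axis=(1, 3))
    # divide by the kernel volume, matching tvm.const(1.0 / a_size) above
    return out / a_size
# ---------------------------------------------------------------------------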