"""
This program is free software; you can redistribute it and/or modify it
under the terms of the Apache License Version 2.0. You may not use this
file except in compliance with the License. This program is distributed
in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the Apache License for more details at
http://www.apache.org/licenses/LICENSE-2.0
"""
import numpy as np

from te import tik
from te.platform.cce_conf import te_set_l2_mode

te_set_l2_mode(1)

# Size of the Ascend 310 AI Core unified buffer (UB), in bytes.
UB_SIZE = 240 * 1024
# Batch size (N).
MAX_BATCH = 1
# Number of channels (C).
MAX_CHANNEL = 1024
# Width (W).
MAX_WIDTH = 32
# Height (H).
MAX_HEIGHT = 32
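# The kernels below index DTYPE_SIZE and call ceil_div, neither of which is
# defined in this excerpt. The definitions here are a minimal sketch of the
# assumed helpers: DTYPE_SIZE maps a dtype name to its element size in bytes,
# and ceil_div is integer ceiling division.
DTYPE_SIZE = {
    "int8": 1,
    "uint8": 1,
    "float16": 2,
    "int32": 4,
    "float32": 4,
}


def ceil_div(value, factor):
    """Return the smallest integer greater than or equal to value / factor."""
    return (value + factor - 1) // factor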
def matmul_tik_compute(params, kernel_name):
    """Build a tiled matrix-multiplication kernel with TIK."""
    te_set_l2_mode(1)
    tik_instance = tik.Tik()
    if not isinstance(params, dict):
        params = params.__dict__
    m_size, k_size, n_size = params['M'], params['K'], params['N']
    data_type = params["data_type"]
    m_tiling_size = int(params["m_tiling_size"])
    n_tiling_size = int(params["n_tiling_size"])
    k_tiling_size = int(params['k_tiling_size'])
    m_cycle_times = params["m_cycle_times"]
    n_cycle_times = params["n_cycle_times"]
    k_cycle_times = params["k_cycle_times"]

    # Determine the output type and the K0 fractal size.
    if data_type == "float16":
        C_loc_out_type = "float32"
        K0 = 16
    else:
        C_loc_out_type = "int32"
        K0 = 32
    block_size = 16

    n_thread_num = params['n_thread_num']
    m_thread_num = params['m_thread_num']
    k_thread_num = params['k_thread_num']

    # Define the input and output tensors in global memory.
    output_gm = tik_instance.Tensor(C_loc_out_type,
                                    (n_size // block_size, m_size, block_size),
                                    name="C_gm", scope=tik.scope_gm)
    inputa_gm = tik_instance.Tensor(params["data_type"],
                                    (k_size // K0, m_size, K0),
                                    name="A_gm", scope=tik.scope_gm)
    inputb_gm = tik_instance.Tensor(params["data_type"],
                                    (k_size // K0, n_size, K0),
                                    name="B_gm", scope=tik.scope_gm)

    # Tiling is realized through the for_range() loops.
    with tik_instance.for_range(0, 2, block_num=2) as core_id:
        with tik_instance.for_range(0, n_cycle_times // 2,
                                    thread_num=n_thread_num) as n_idx:
            with tik_instance.for_range(0, m_cycle_times,
                                        thread_num=m_thread_num) as m_idx:
                dst_l0c = tik_instance.Tensor(
                    C_loc_out_type,
                    [n_tiling_size // 16, m_tiling_size, 16],
                    name='dst_l0c', scope=tik.scope_cbuf_out)
                with tik_instance.for_range(
                        0, k_cycle_times, thread_num=k_thread_num) as k_idx:
                    # Move the current tiles of A and B from global memory
                    # into L1.
                    inputa_l1 = tik_instance.Tensor(
                        params['data_type'],
                        [k_tiling_size // K0, m_tiling_size, K0],
                        name="A_tiling_l1", scope=tik.scope_cbuf)
                    tik_instance.data_move(
                        inputa_l1,
                        inputa_gm[k_idx * k_tiling_size // K0,
                                  m_idx * m_tiling_size, :],
                        0, k_tiling_size // K0, m_tiling_size,
                        m_size - m_tiling_size, 0)
                    inputb_l1 = tik_instance.Tensor(
                        params["data_type"],
                        [k_tiling_size // K0, n_tiling_size, K0],
                        name="B_tiling_l1", scope=tik.scope_cbuf)
                    # Split the transfer when the source stride would exceed
                    # the 65535 limit of data_move.
                    if n_size - n_tiling_size > 65535:
                        with tik_instance.for_range(
                                0, k_tiling_size // K0) as dma_k_idx:
                            tik_instance.data_move(
                                inputb_l1[dma_k_idx, :, :],
                                inputb_gm[k_idx * k_tiling_size // K0 + dma_k_idx,
                                          (core_id * n_cycle_times // 2 + n_idx)
                                          * n_tiling_size, :],
                                0, 1, n_tiling_size, 0, 0)
                    else:
                        tik_instance.data_move(
                            inputb_l1,
                            inputb_gm[k_idx * k_tiling_size // K0,
                                      (core_id * n_cycle_times // 2 + n_idx)
                                      * n_tiling_size, :],
                            0, k_tiling_size // K0, n_tiling_size,
                            n_size - n_tiling_size, 0)
                    # Call the matmul API to perform the matrix multiplication;
                    # initialize L0C only on the first K iteration.
                    with tik_instance.if_scope(k_idx == 0):
                        tik_instance.matmul(dst_l0c, inputa_l1, inputb_l1,
                                            m_tiling_size, k_tiling_size,
                                            n_tiling_size, init_l1out=True)
                    with tik_instance.else_scope():
                        tik_instance.matmul(dst_l0c, inputa_l1, inputb_l1,
                                            m_tiling_size, k_tiling_size,
                                            n_tiling_size, init_l1out=False)
                # Move the accumulated result from L0C back to global memory.
                tik_instance.fixpipe(
                    output_gm[n_tiling_size // 16 *
                              (core_id * n_cycle_times // 2 + n_idx),
                              m_idx * m_tiling_size, :],
                    dst_l0c, n_tiling_size // 16,
                    m_tiling_size * 16 * DTYPE_SIZE[C_loc_out_type] // 32,
                    (m_size - m_tiling_size) * 16 *
                    DTYPE_SIZE[C_loc_out_type] // 32, 0)
    tik_instance.BuildCCE(kernel_name=kernel_name,
                          inputs=[inputa_gm, inputb_gm],
                          outputs=[output_gm])
    return tik_instance
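# A minimal usage sketch for matmul_tik_compute. The shapes and tiling values
# below are illustrative assumptions, not taken from the original sample: they
# only need to satisfy M = m_tiling_size * m_cycle_times (and likewise for K
# and N, with the N cycles split across the two cores).
def _matmul_tik_example():
    params = {
        "M": 256, "K": 256, "N": 256,
        "data_type": "float16",
        "m_tiling_size": 64, "n_tiling_size": 64, "k_tiling_size": 64,
        "m_cycle_times": 4, "n_cycle_times": 4, "k_cycle_times": 4,
        "m_thread_num": 1, "n_thread_num": 1, "k_thread_num": 1,
    }
    return matmul_tik_compute(params, "matmul_tik")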
def conv2d_tik_compute(params):
    """Build a 2-D convolution kernel with TIK."""
    te_set_l2_mode(1)
    tik_instance = tik.Tik(tik.Dprofile(params["arch"], params["version"]),
                           err_msg_level=1)

    n, c1, h, w, c0 = params["fm_shape"]
    c1, kh, kw, cout, c0 = params["weight_shape"]
    stride_h, stride_w = params["stride_list"]
    dilation_h, dilation_w = params["dilation_list"]
    pad_top, pad_bot, pad_left, pad_right = params["pad_list"]

    # Output height and width after padding, dilation and stride.
    kh_dilation = (kh - 1) * dilation_h + 1
    kw_dilation = (kw - 1) * dilation_w + 1
    ho = int(np.ceil((h + pad_top + pad_bot - kh_dilation + 1) / stride_h))
    wo = int(np.ceil((w + pad_right + pad_left - kw_dilation + 1) / stride_w))
    round_howo = ceil_div(ho * wo, 16) * 16

    # Define the input and output tensors in global memory.
    fm_gm = tik_instance.Tensor(params['fm_dtype'], (n, c1, h, w, c0),
                                name='fm_gm', scope=tik.scope_gm)
    weight_gm = tik_instance.Tensor(params['weight_type'],
                                    (c1, kh, kw, cout, c0),
                                    name='weight_gm', scope=tik.scope_gm)
    if params['dst_gm_type'] in ("int8", "uint8"):
        dst_gm = tik_instance.Tensor(params['dst_gm_type'],
                                     [n, cout // 32, ho, wo, 32],
                                     name='dst_gm', scope=tik.scope_gm)
    else:
        dst_gm = tik_instance.Tensor(params['dst_gm_type'],
                                     [n, cout // 16, ho, wo, 16],
                                     name='dst_gm', scope=tik.scope_gm)

    # Split the output channels across the two AI Cores.
    core_num = 2
    pre_core_cout = cout // core_num
    cout_iter_num = pre_core_cout // params["cout_split_factor"]
    Cin_blocks = c1

    with tik_instance.for_range(0, core_num, block_num=core_num) as cout_o:
        with tik_instance.for_range(0, cout_iter_num, thread_num=1) as cout_i:
            # Move the weight slice handled by this core/iteration into L1.
            weight_L1 = tik_instance.Tensor(
                params['weight_type'],
                (Cin_blocks, kh, kw, params["cout_split_factor"], c0),
                name='weight_l1', scope=tik.scope_cbuf)
            tik_instance.data_move(
                weight_L1,
                weight_gm.flatten()[cout_o * pre_core_cout * c0 +
                                    params["cout_split_factor"] * cout_i * c0],
                0, Cin_blocks * kh * kw, params["cout_split_factor"],
                (cout - params["cout_split_factor"]), 0)
            with tik_instance.for_range(0, n, thread_num=2) as n_index:
                # Move one batch of the feature map into L1.
                feature_map_l1 = tik_instance.Tensor(
                    params['fm_dtype'], (c1, h, w, c0),
                    name='feature_map_l1', scope=tik.scope_cbuf)
                tik_instance.data_move(feature_map_l1,
                                       fm_gm[n_index, :, :, :, :],
                                       0, 1, c1 * h * w, 0, 0)
                dst_l0c = tik_instance.Tensor(
                    params['dst_l0c_type'],
                    [params["cout_split_factor"] // 16, round_howo, 16],
                    name='dst_l0c', scope=tik.scope_cbuf_out)
                # Call the conv2d API to compute the convolution.
                tik_instance.conv2d(
                    dst_l0c, feature_map_l1, weight_L1,
                    (c1, h, w, c0),
                    (Cin_blocks, kh, kw, params["cout_split_factor"], c0),
                    params['stride_list'], params['pad_list'],
                    params['dilation_list'], params['pad_value'])
                # Move the result from L0C to global memory; quantization, if
                # requested, is applied in the fixpipe stage.
                tik_instance.fixpipe(
                    dst_gm[n_index,
                           (cout_o * pre_core_cout +
                            params["cout_split_factor"] * cout_i) //
                           (32 // DTYPE_SIZE[params['dst_gm_type']]),
                           0, 0, 0],
                    dst_l0c, params["cout_split_factor"] // 16,
                    ho * wo * 16 * DTYPE_SIZE[params['dst_l0c_type']] // 32,
                    0, 0,
                    extend_params={
                        "bias": None,
                        "quantize_params": params["quantize_params"]})

    tik_instance.BuildCCE(kernel_name=params["kernel_name"],
                          inputs=[fm_gm, weight_gm],
                          outputs=[dst_gm])
    return tik_instance
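# A minimal usage sketch for conv2d_tik_compute. All values are illustrative
# assumptions rather than the original test case: a float16 feature map in
# NC1HWC0 layout, float16 fractal weights, float32 output with no fixpipe
# quantization, and the v100 "mini" profile used by the matmul variant below.
def _conv2d_tik_example():
    params = {
        "arch": "v100", "version": "mini",
        "fm_shape": (2, 4, 28, 28, 16),      # N, C1, H, W, C0
        "weight_shape": (4, 3, 3, 64, 16),   # C1, KH, KW, Cout, C0
        "fm_dtype": "float16",
        "weight_type": "float16",
        "dst_l0c_type": "float32",
        "dst_gm_type": "float32",
        "stride_list": [1, 1],
        "dilation_list": [1, 1],
        "pad_list": [1, 1, 1, 1],
        "pad_value": 0,
        "cout_split_factor": 32,
        "quantize_params": None,
        "kernel_name": "conv2d_tik",
    }
    return conv2d_tik_compute(params)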
def matmul_tik_compute(params, kernel_name, new_ws=None, cnt=0):
    """Variant of the tiled matmul kernel above, built against the v100
    "mini" profile. The new_ws and cnt arguments are accepted but unused."""
    te_set_l2_mode(1)
    tik_instance = tik.Tik(tik.Dprofile('v100', 'mini'), err_msg_level=1)
    if not isinstance(params, dict):
        params = params.__dict__
    m, k, n = params['M'], params['K'], params['N']
    data_type = params["data_type"]
    m_tiling_size = int(params["m_tiling_size"])
    n_tiling_size = int(params["n_tiling_size"])
    k_tiling_size = int(params['k_tiling_size'])
    m_cycle_times = params["m_cycle_times"]
    n_cycle_times = params["n_cycle_times"]
    k_cycle_times = params["k_cycle_times"]

    if data_type == "float16":
        C_loc_out_type = "float32"
        K0 = 16
    else:
        C_loc_out_type = "int32"
        K0 = 32
    block_size = 16

    n_thread_num = params['n_thread_num']
    m_thread_num = params['m_thread_num']
    k_thread_num = params['k_thread_num']

    C_gm = tik_instance.Tensor(C_loc_out_type,
                               (n // block_size, m, block_size),
                               name="C_gm", scope=tik.scope_gm)
    A_gm = tik_instance.Tensor(params["data_type"], (k // K0, m, K0),
                               name="A_gm", scope=tik.scope_gm)
    B_gm = tik_instance.Tensor(params["data_type"], (k // K0, n, K0),
                               name="B_gm", scope=tik.scope_gm)

    with tik_instance.for_range(0, 2, block_num=2) as core_id:
        with tik_instance.for_range(0, n_cycle_times // 2,
                                    thread_num=n_thread_num) as n_idx:
            with tik_instance.for_range(0, m_cycle_times,
                                        thread_num=m_thread_num) as m_idx:
                dst_l0c = tik_instance.Tensor(
                    C_loc_out_type,
                    [n_tiling_size // 16, m_tiling_size, 16],
                    name='dst_l0c', scope=tik.scope_cbuf_out)
                with tik_instance.for_range(
                        0, k_cycle_times, thread_num=k_thread_num) as k_idx:
                    # Move the current tiles of A and B into L1.
                    A_l1 = tik_instance.Tensor(
                        params['data_type'],
                        [k_tiling_size // K0, m_tiling_size, K0],
                        name="A_tiling_l1", scope=tik.scope_cbuf)
                    tik_instance.data_move(
                        A_l1,
                        A_gm[k_idx * k_tiling_size // K0,
                             m_idx * m_tiling_size, :],
                        0, k_tiling_size // K0, m_tiling_size,
                        m - m_tiling_size, 0)
                    B_l1 = tik_instance.Tensor(
                        params["data_type"],
                        [k_tiling_size // K0, n_tiling_size, K0],
                        name="B_tiling_l1", scope=tik.scope_cbuf)
                    if n - n_tiling_size > 65535:
                        with tik_instance.for_range(
                                0, k_tiling_size // K0) as dma_k_idx:
                            tik_instance.data_move(
                                B_l1[dma_k_idx, :, :],
                                B_gm[k_idx * k_tiling_size // K0 + dma_k_idx,
                                     (core_id * n_cycle_times // 2 + n_idx)
                                     * n_tiling_size, :],
                                0, 1, n_tiling_size, 0, 0)
                    else:
                        tik_instance.data_move(
                            B_l1,
                            B_gm[k_idx * k_tiling_size // K0,
                                 (core_id * n_cycle_times // 2 + n_idx)
                                 * n_tiling_size, :],
                            0, k_tiling_size // K0, n_tiling_size,
                            n - n_tiling_size, 0)
                    with tik_instance.if_scope(k_idx == 0):
                        tik_instance.matmul(dst_l0c, A_l1, B_l1,
                                            m_tiling_size, k_tiling_size,
                                            n_tiling_size, init_l1out=True)
                    with tik_instance.else_scope():
                        tik_instance.matmul(dst_l0c, A_l1, B_l1,
                                            m_tiling_size, k_tiling_size,
                                            n_tiling_size, init_l1out=False)
                # Move the accumulated result from L0C back to global memory.
                tik_instance.fixpipe(
                    C_gm[n_tiling_size // 16 *
                         (core_id * n_cycle_times // 2 + n_idx),
                         m_idx * m_tiling_size, :],
                    dst_l0c, n_tiling_size // 16,
                    m_tiling_size * 16 * DTYPE_SIZE[C_loc_out_type] // 32,
                    (m - m_tiling_size) * 16 * DTYPE_SIZE[C_loc_out_type] // 32,
                    0)
    tik_instance.BuildCCE(kernel_name=kernel_name,
                          inputs=[A_gm, B_gm],
                          outputs=[C_gm])
    return tik_instance