def __init__(self, ib_, dtype):
    self.ib_ = ib_
    self.dtype = dtype
    self.type_size = tbe_platform.cce_intrin.get_bit_len(dtype) // 8
    self.cp_align_len = cce_params.BLOCK_REDUCE_INT8 // self.type_size
    self.unified_buffer_len = tbe_platform.get_soc_spec(
        tbe_platform.cce_conf.UB_SIZE) // self.type_size
    self.vec_align_len = cce_params.VECTOR_INST_BLOCK_WIDTH // self.type_size
    self.uint8_max_value = 255
    self.last_block = ib_.allocate("int32", (1, ), name="last_block",
                                   scope=cce_params.scope_reg)
    self.device_core_num = tbe_platform.get_soc_spec(
        tbe_platform.cce_conf.CORE_NUM)
    self.block = tvm.thread_axis("blockIdx.x")
    self.ib_.scope_attr(self.block, "thread_extent", self.device_core_num)
    self.input_ub = 0
    self.output_ub = 0
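# A minimal sketch of the sizing arithmetic above, assuming
# BLOCK_REDUCE_INT8 == 32 and VECTOR_INST_BLOCK_WIDTH == 256 as on typical
# Ascend targets; a standalone illustration, not part of the class.
def _align_lens(dtype_bits, block_bytes=32, vector_bytes=256):
    type_size = dtype_bits // 8                # bytes per element
    cp_align_len = block_bytes // type_size    # elements per 32B copy block
    vec_align_len = vector_bytes // type_size  # elements per vector instruction
    return cp_align_len, vec_align_len

# float16 -> (16, 128); float32 -> (8, 64)
assert _align_lens(16) == (16, 128)
assert _align_lens(32) == (8, 64)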
def _is_load3d_special():
    # limitation by chip:
    # on Ascend910, load3d is not supported when only the fmap w after
    # padding equals the filter w
    if get_soc_spec("SOC_VERSION") == 'Ascend910' \
            and fmap_h_padding != filter_h \
            and fmap_w_padding == filter_w:
        return False
    # limitation by chip:
    # if kernel h, w are in [1, 11] and the fmap h/w after padding equals
    # the filter h/w, load3d supports an output h/w of 1
    if (1 <= filter_h <= 11) and (1 <= filter_w <= 11) \
            and (fmap_h_padding == filter_h or fmap_w_padding == filter_w):
        return True
    return False
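# A plain-Python sketch of the predicate above; the parameter names are
# local stand-ins for variables taken from the enclosing scope.
def is_load3d_special(soc, fmap_h_padding, fmap_w_padding,
                      filter_h, filter_w):
    if soc == 'Ascend910' and fmap_h_padding != filter_h \
            and fmap_w_padding == filter_w:
        return False
    if 1 <= filter_h <= 11 and 1 <= filter_w <= 11 \
            and (fmap_h_padding == filter_h or fmap_w_padding == filter_w):
        return True
    return False

# e.g. a 3x3 kernel on a 3x5 padded fmap: output h == 1, load3d supported
assert is_load3d_special('Ascend910', 3, 5, 3, 3) is True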
def _min_l1_byte():
    # Fourth: L1 limitation, mainly required by chip
    al1_min_byte = C0 * C0 * 2
    if not _is_conv1d_situation():
        kl1_min = fmap_w
    else:
        kl1_min = (C0 - 1) * stride_w + filter_w_dilation
    if dedy_w % C0 == 0:
        bl1_min_byte = filter_h_dilation * kl1_min * C0 * 2
    else:
        bl1_min_byte = (filter_h_dilation + stride_h) * kl1_min * C0 * 2

    l1_size = get_soc_spec("L1_SIZE")  # L1 size
    if (al1_min_byte + bl1_min_byte) > l1_size:
        dict_args = {"errCode": "E60026"}
        raise RuntimeError(dict_args, err_man.get_error_message(dict_args))
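# A plain-Python sketch of the minimum-L1 estimate above, assuming C0 == 16
# and 2-byte (float16) data; parameter names are illustrative stand-ins for
# the enclosing scope's variables, not the kernel's API.
def min_l1_bytes(fmap_w, dedy_w, stride_h, stride_w,
                 filter_h_dilation, filter_w_dilation,
                 conv1d=False, c0=16, dtype_bytes=2):
    al1 = c0 * c0 * dtype_bytes  # one C0 x C0 block in AL1
    kl1 = (c0 - 1) * stride_w + filter_w_dilation if conv1d else fmap_w
    rows = filter_h_dilation if dedy_w % c0 == 0 \
        else filter_h_dilation + stride_h
    bl1 = rows * kl1 * c0 * dtype_bytes  # minimal fmap rows kept in BL1
    return al1 + bl1

# e.g. a 3x3 kernel on a 64-wide fmap with stride 1:
# 16*16*2 + 3*64*16*2 = 6656 bytes of L1 at minimum
assert min_l1_bytes(fmap_w=64, dedy_w=64, stride_h=1, stride_w=1,
                    filter_h_dilation=3, filter_w_dilation=3) == 6656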
def euclidean_norm_d_compute(x, y, axes, keepdims,
                             kernel_name="euclidean_norm_d"):
    """
    calculating data

    Parameters
    ----------
    x : TVM tensor
        the placeholder of input_x
    y : dict
        dict of output_y, includes keys (shape and dtype)
    axes : int, list, tuple or NoneType
        the axes to reduce over
    keepdims : bool or NoneType
        if true, retains reduced dimensions with length 1
    kernel_name : str
        kernel name, default value is "euclidean_norm_d"

    Returns
    -------
    res : TVM tensor
        the calculation results
    """
    dtype = x.dtype.lower()
    shape = x.shape
    product = get_soc_spec("SOC_VERSION")
    if product != "Ascend310" and dtype != "float32":
        x = te.lang.cce.cast_to(x, "float32")

    # when every reduced axis has length 1, pass the input through unchanged
    one_flag = []
    axis = list(axes)
    for i in axis:
        one_flag.append(int(shape[i]))
    if int(len(set(one_flag))) == 1 and int(one_flag[0]) == 1:
        res = te.lang.cce.vmuls(x, 1)
    else:
        res_mul = te.lang.cce.vmul(x, x)
        res_sum = te.lang.cce.sum(res_mul, axes, keepdims)
        res = te.lang.cce.vsqrt(res_sum, 1)
    if res.dtype != dtype:
        res = te.lang.cce.cast_to(res, dtype)
    return res
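# A NumPy reference for the general path of euclidean_norm_d_compute; a
# sketch for checking the math, not the operator itself.
import numpy as np

def euclidean_norm_ref(x, axes, keepdims=False):
    # square root of the sum of squares over the reduced axes
    return np.sqrt(np.sum(x * x, axis=tuple(axes), keepdims=keepdims))

x_demo = np.array([[3.0, 4.0]], dtype=np.float32)
print(euclidean_norm_ref(x_demo, axes=[1]))  # [5.]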
def _check_l1_limitation():
    block_size = 16
    w_value = dedy_w * stride_w
    if fmap_w > block_size:
        h_value_max = filter_h_dilation + 1
    elif block_size % fmap_w == 0:
        h_value_max = filter_h_dilation + block_size // fmap_w - 1
    else:
        h_value_max = filter_h_dilation + block_size // fmap_w + 1

    a_l1_size = h_value_max * w_value * \
        ((filter_d_dilation - 2) // stride_d + 2) * block_size * 2
    b_l1_size = filter_h_dilation * filter_w_dilation * \
        filter_d_dilation * block_size * block_size * 2
    l1_size = get_soc_spec("L1_SIZE")
    if (a_l1_size + b_l1_size) > l1_size:
        dict_args = {'errCode': 'E60022'}
        raise RuntimeError(dict_args, err_man.get_error_message(dict_args))
def _check_l1_size_limit():
    def _l1fusion_size_limit(l1_size):
        l1fusion_l1_size = 0
        if pads != [0, 0, 0, 0] or [filter_h, filter_w] != [1, 1]:
            if stride_h > 1 or stride_w > 1:
                l1fusion_l1_size = l1_size
        return l1fusion_l1_size

    c0_size = cce_params.C0_SIZE
    c0_size_k = cce_params.CUBE_MKN[filter_dtype]['mac'][1]
    w_value = dedy_w * stride_w
    if fmap_w > c0_size:
        h_value_max = filter_h_dilation + 1
    elif c0_size % fmap_w == 0:
        h_value_max = filter_h_dilation + c0_size // fmap_w - 1
    else:
        h_value_max = filter_h_dilation + c0_size // fmap_w + 1

    a_l1_size = h_value_max * w_value * \
        c0_size_k * BIT_RATIO_DICT.get(out_backprop_dtype)
    if _is_conv1d_situation():
        load3d_stride = 1
        a_l1_m_length = (c0_size - 1) * load3d_stride + filter_w_dilation
        a_l1_size = a_l1_m_length * \
            c0_size_k * BIT_RATIO_DICT.get(out_backprop_dtype)
    b_l1_size = filter_h_dilation * filter_w_dilation * \
        c0_size * c0_size_k * BIT_RATIO_DICT.get(filter_dtype)
    if fusion_para.get("l1_fusion_type") != -1:
        a_l1_size = _l1fusion_size_limit(a_l1_size)
    l1_size = get_soc_spec("L1_SIZE")
    if (a_l1_size + b_l1_size) > l1_size:
        args_dict = {"errCode": "E60022"}
        raise RuntimeError(args_dict, err_man.get_error_message(args_dict))
def cosine_embedding_loss_compute(x1, x2, target, output_y,
                                  x_shape_broadcat, tgt_shape_broadcast,
                                  margin=0, reduction='mean',
                                  kernel_name="cosine_embedding_loss"):
    """
    DSL description of the cosine_embedding_loss operator's calculation process

    Parameters
    ----------
    x1: TVM tensor
        the placeholder of x1 input data
    x2: TVM tensor
        the placeholder of x2 input data
    target: TVM tensor
        the placeholder of target input data
    output_y: TVM tensor
        the placeholder of output data
    x_shape_broadcat: list
        broadcast shape of x1 and x2
    tgt_shape_broadcast: list
        broadcast shape of x and target
    margin: float
        margin, default value is 0.0
    reduction: str
        string indicating the reduce method, default value is "mean"
    kernel_name: str
        cce kernel name, default value is "cosine_embedding_loss"

    Returns
    -------
    res: TVM tensor
    """
    cce_plat = cceconf.get_soc_spec('SOC_VERSION')
    cast_dtype = 'float32'
    epsilon = tvm.const(1e-12, dtype="float32")
    if cce_plat == 'Ascend310':
        cast_dtype = 'float16'
        epsilon = tvm.const(5e-8, dtype="float16")

    if x1.dtype.lower() != cast_dtype and x1.dtype.lower() != 'float32':
        x1 = te.lang.cce.cast_to(x1, cast_dtype)
    if x2.dtype.lower() != cast_dtype and x2.dtype.lower() != 'float32':
        x2 = te.lang.cce.cast_to(x2, cast_dtype)
    target = te.lang.cce.cast_to(target, x1.dtype)

    x1_broadcast = te.lang.cce.broadcast(x1, x_shape_broadcat)
    x2_broadcast = te.lang.cce.broadcast(x2, x_shape_broadcat)
    target_broadcast = te.lang.cce.broadcast(target, tgt_shape_broadcast)

    # DSL description for cosine similarity compute
    prod = te.lang.cce.vmul(x1_broadcast, x2_broadcast)
    mag1 = te.lang.cce.vmul(x1_broadcast, x1_broadcast)
    mag2 = te.lang.cce.vmul(x2_broadcast, x2_broadcast)
    mag_square1 = te.lang.cce.sum(mag1, axis=1)
    mag_square2 = te.lang.cce.sum(mag2, axis=1)
    x1_epsilon = te.lang.cce.vadds(mag_square1, epsilon)
    x2_epsilon = te.lang.cce.vadds(mag_square2, epsilon)
    x1_sqrt = te.lang.cce.vsqrt(x1_epsilon)
    x2_sqrt = te.lang.cce.vsqrt(x2_epsilon)
    mode_num = te.lang.cce.vmul(x1_sqrt, x2_sqrt)
    prod_num = te.lang.cce.sum(prod, axis=1)
    cos_res = te.lang.cce.vdiv(prod_num, mode_num)

    # DSL description for 1 - cos(x1, x2)
    zero_tensor = te.lang.cce.vmuls(target_broadcast, 0)
    one_tensor = te.lang.cce.vadds(zero_tensor, 1)
    neg_one_tensor = te.lang.cce.vsub(zero_tensor, one_tensor)
    pos = te.lang.cce.vsub(one_tensor, cos_res)

    # DSL description for max(0, cos(x1, x2) - margin)
    margin_const = tvm.const(margin, dtype="float32")
    margin_tensor = te.lang.cce.vmuls(one_tensor, margin_const)
    neg_sub = te.lang.cce.vsub(cos_res, margin_tensor)
    neg = te.lang.cce.vmax(zero_tensor, neg_sub)

    # DSL description for output = pos if y == 1 else neg
    output_pos = te.lang.cce.vcmpsel(target_broadcast, one_tensor, 'eq',
                                     pos, zero_tensor)
    output_neg = te.lang.cce.vcmpsel(target_broadcast, neg_one_tensor, 'eq',
                                     neg, zero_tensor)
    res = te.lang.cce.vadd(output_pos, output_neg)

    if reduction in ['sum', 'mean']:
        if reduction == 'mean':
            num = reduce(lambda x, y: x * y, tgt_shape_broadcast)
            mean_cof = num**(-1)
            res = te.lang.cce.vmuls(res, mean_cof)
        res = te.lang.cce.cast_to(res, 'float32')
        reduce_axis = [index for index, _ in enumerate(tgt_shape_broadcast)]
        res_sum = te.lang.cce.sum(res, axis=reduce_axis)
        return res_sum
    return te.lang.cce.cast_to(res, 'float32')
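# A NumPy reference for the loss computed above; a sketch that mirrors
# PyTorch-style CosineEmbeddingLoss semantics (target in {+1, -1}, cosine
# similarity over axis 1) rather than the DSL's exact epsilon and mean
# scaling.
import numpy as np

def cosine_embedding_loss_ref(x1, x2, target, margin=0.0, reduction='mean'):
    cos = np.sum(x1 * x2, axis=1) / (
        np.sqrt(np.sum(x1 * x1, axis=1)) * np.sqrt(np.sum(x2 * x2, axis=1)))
    # pos branch for target == 1, hinge branch otherwise
    loss = np.where(target == 1, 1.0 - cos, np.maximum(0.0, cos - margin))
    if reduction == 'mean':
        return loss.mean()
    if reduction == 'sum':
        return loss.sum()
    return loss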
def bn_training_reduce_schedule_nd(res, core_num=None):
    """bn_training_reduce schedule method"""
    cce_emitinsn_params.cceEmitParamsIns.clear_param()

    # Prepare extra tensors
    # Step 1: Get two output tensors
    # Step 2: Merge two output tensors into a dummy tensor
    # Step 3: Move UB data to the GM tensor
    output_first = res[0]   # Square Sum
    output_second = res[1]  # Sum
    final_output = tvm.compute(
        output_first.shape,
        lambda *indices: output_first(*indices) + output_second(*indices),
        name="DummyYummySweety")
    is_cast = False
    if "cast" in output_second.op.input_tensors[0].name:
        is_cast = True

    # Calculate block split factor from axis_n_size and core_num
    axis_n_size = int(res[0].shape[1])
    if not core_num:
        core_num = int(cceconf.get_soc_spec("CORE_NUM"))
    # Multi-core kernels require aligned output
    element_size = cce_util.get_align_factor(output_first.dtype)[1]
    block_element_num = \
        te.platform.cce_intrin_md.ALIGNMENT_BYTES // element_size
    estimate_block_split_factor = max(axis_n_size // core_num, 8)
    nearest_aligned_factor = estimate_block_split_factor % block_element_num
    # Decrease core_num until the output can be aligned
    if estimate_block_split_factor < block_element_num and core_num > 1:
        return bn_training_reduce_schedule_nd(res, core_num - 1)
    # Round down to the nearest aligned factor
    block_split_factor = estimate_block_split_factor - nearest_aligned_factor

    # Calculate UB split
    ub_size = te.platform.CceProductParams().getParams("Unified_Buffer") // 2
    reduce_data_num = 1
    reduce_data_factor = 2
    if is_cast:
        reduce_data_factor = 3
    for reduce_axis in output_first.op.reduce_axis:
        reduce_data_num *= int(reduce_axis.dom.extent)
    reduce_data_num *= reduce_data_factor
    max_possible_loop = ub_size // (element_size * reduce_data_num)
    actual_loop = 1
    for loop in range(max_possible_loop - 1, 0, -1):
        if block_split_factor % loop == 0:
            actual_loop = loop
            break
    # Force alignment if multi-core is enabled
    if actual_loop < block_element_num \
            and actual_loop < block_split_factor and core_num > 1:
        actual_loop = block_element_num

    # Find all tensors
    if is_cast:
        # With cast, prepare tensor parameters
        mul_tensor = output_first.op.input_tensors[0]
        cast_tensor = mul_tensor.op.input_tensors[0]
        res_input = cast_tensor.op.input_tensors[0]
        input_tensor_next = [cast_tensor]  # First compute tensor is cast_tensor
        ub_tensors = [cast_tensor, mul_tensor, output_first, output_second]
    else:
        # Without cast, prepare tensor parameters
        cast_tensor = None
        mul_tensor = output_first.op.input_tensors[0]
        res_input = mul_tensor.op.input_tensors[0]
        input_tensor_next = [mul_tensor, output_second]  # First compute tensor is mul_tensor
        ub_tensors = [mul_tensor, output_first, output_second]

    # Create original schedule
    sch = tvm.create_schedule(final_output.op)

    # ////////////////////////////////////
    # ///////// DataFlow Control /////////
    # ////////////////////////////////////
    # Read input in
    input_tensor_ub = sch.cache_read(res_input, cce_params.scope_ubuf,
                                     input_tensor_next)
    ub_tensors.append(input_tensor_ub)
    # Compute procedure in ubuf
    for ub_tens in ub_tensors:
        sch[ub_tens].set_scope(cce_params.scope_ubuf)

    # ////////////////////////////////////
    # //////// Split axis Control ////////
    # ////////////////////////////////////
    outer, inner = \
        sch[final_output].split(sch[final_output].op.axis[1],
                                factor=block_split_factor)
    ub_outer, ub_inner = sch[final_output].split(inner, factor=actual_loop)
    sch[final_output].bind(outer, tvm.thread_axis("blockIdx.x"))

    # ////////////////////////////////////
    # ///////// Compute Control //////////
    # ////////////////////////////////////
    compute_at_axis = ub_outer
    for ub_tens in ub_tensors:
        sch[ub_tens].compute_at(sch[final_output], compute_at_axis)

    # ////////////////////////////////////
    # //////////// EmitInsn //////////////
    # ////////////////////////////////////
    def emit_on_self(tensor, axisnum=0, op='dma_copy'):
        """Do emit insn"""
        sch[tensor].emit_insn(sch[tensor].op.axis[axisnum], op)

    def emit_on_self_ex(tensor, axis, op='dma_copy'):
        """Do emit insn"""
        sch[tensor].emit_insn(axis, op)

    # Emit DMA copy for the UB input read
    emit_on_self(input_tensor_ub, 0)
    if is_cast:
        emit_on_self(cast_tensor, 0, cast_tensor.op.tag.split('|')[0])
    emit_on_self(mul_tensor, 0, mul_tensor.op.tag)
    sch[output_first].pragma(sch[output_first].op.axis[1],
                             "emit_insn", "bn_reduce_sum")
    sch[output_second].pragma(sch[output_second].op.axis[1],
                              "emit_insn", "bn_reduce_sum")
    sch[output_first].double_buffer()
    sch[output_second].double_buffer()
    emit_on_self_ex(final_output, ub_inner, "binary_reduce_output_reversed")

    def new_alloc(dtype, shape, name):
        """Alloc mem"""
        new_buffer = tvm.decl_buffer(shape, dtype, name=name,
                                     scope="", data=None)
        return new_buffer

    out_buffer_sec = new_alloc(final_output.dtype, (block_split_factor, ),
                               "reduce_sec_output_gm")
    cce_emitinsn_params.cceEmitParamsIns.insert_param(
        "binary_reduce_output_buffer", out_buffer_sec)
    tensor_list = [res_input, final_output, out_buffer_sec]
    return sch, tensor_list
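# A NumPy reference for the two outputs this schedule wires up; a sketch:
# bn_training_reduce produces the per-channel sum of squares (res[0]) and
# sum (res[1]); the reduce axes below are illustrative for an N*H*W
# reduction, not read from the schedule.
import numpy as np

def bn_training_reduce_ref(x, reduce_axes=(0, 2, 3)):
    square_sum = np.sum(x * x, axis=reduce_axes)  # res[0]
    total_sum = np.sum(x, axis=reduce_axes)       # res[1]
    return square_sum, total_sum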
def __init__(self, input_x, auxiliary_coefficients, auxiliary_offset,
             kernel_name='stn_compute'):
    self.d_type_x = input_x.get('dtype')
    self.theta_dtype = auxiliary_coefficients.get('dtype')
    self.position_dtype = auxiliary_offset.get('dtype')
    self.shape = input_x.get('shape')
    self.kernel_name = kernel_name

    # product_name = tik_get_soc_name.get_soc_name()
    self.tik_instance = tik.Tik(tik.Dprofile())
    self.ai_core_num = tik.Dprofile().get_aicore_num()
    ub_size_bytes = cce.get_soc_spec(
        cce.cce_conf.UB_SIZE) // 2  # double buffer

    self.d_type_bytes_size = cce.cce_intrin.get_bit_len(self.d_type_x) // 8
    self.theta_type_bytes_size = cce.cce_intrin.get_bit_len(
        auxiliary_coefficients.get('dtype')) // 8
    self.offset_type_bytes_size = cce.cce_intrin.get_bit_len(
        auxiliary_offset.get('dtype')) // 8
    self.vec_compute_size = 256

    # theta size: output_h * output_w * 4 * n * c1
    self.theta_size = auxiliary_coefficients.get('shape')[0] * \
        auxiliary_coefficients.get('shape')[1] * \
        auxiliary_coefficients.get('shape')[2]

    # output_h * output_w
    self.output_hw = self.theta_size // 4 // self.shape[0] // self.shape[1]

    # tiling policy
    self.total_c1 = self.shape[0] * self.shape[1]
    self.ub_tensor_size = 16 \
        if self.shape[1] * self.shape[4] * self.d_type_bytes_size > ub_size_bytes * 0.4 \
        else self.shape[1] * self.shape[4]
    self.input_stride = (self.shape[2] * self.shape[3] * self.shape[4] -
                         self.shape[4]) * self.d_type_bytes_size // 32
    self.output_stride = (self.output_hw * self.shape[4] -
                          self.shape[4]) * self.d_type_bytes_size // 32
    self.if_skip_read_ceof = self.ub_tensor_size != 16

    # nc1hwc0, c0 = 16; theta type same as input_x
    self.ub_tensor_len = self.ub_tensor_size * self.d_type_bytes_size // 32
    self.input_hw = self.shape[2] * self.shape[3]

    # ub theta size must be a multiple of 4 and 32
    ub_theta_offset_can_use = \
        (ub_size_bytes - self.ub_tensor_size * self.d_type_bytes_size * 2) \
        // (self.theta_type_bytes_size + self.offset_type_bytes_size)
    self.ub_theta_offset_size = \
        ub_theta_offset_can_use - ub_theta_offset_can_use % 4
    theta_burst_len = \
        self.ub_theta_offset_size * self.theta_type_bytes_size // 32
    offset_burst_len = \
        self.ub_theta_offset_size * self.offset_type_bytes_size // 32
    self.ub_theta_offset_size = min(
        theta_burst_len * 32 // self.theta_type_bytes_size,
        offset_burst_len * 32 // self.offset_type_bytes_size)

    self.input_num = reduce(lambda x, y: x * y, input_x.get('shape'))
    # self.input_num = self.theta_size * 4

    # input data
    self.input_x_gm = self.tik_instance.Tensor(self.d_type_x,
                                               (self.input_num, ),
                                               name='input_x_gm',
                                               scope=tik.scope_gm)
    # theta matrix
    self.input_theta_gm = self.tik_instance.Tensor(
        auxiliary_coefficients.get('dtype'), (self.theta_size, ),
        name='input_theta_gm', scope=tik.scope_gm)
    # position offset matrix
    self.input_position_gm = self.tik_instance.Tensor(
        auxiliary_offset.get('dtype'), (self.theta_size, ),
        name='input_position_gm', scope=tik.scope_gm)
    # output data
    self.output_y_gm = self.tik_instance.Tensor(
        self.d_type_x,
        (self.output_hw * self.shape[0] * self.shape[1] * self.shape[4], ),
        name='output_y_gm', scope=tik.scope_gm)
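# A worked example of the burst-length rounding above; a standalone sketch
# with assumed element sizes (float16 theta -> 2 bytes, int32 offset -> 4
# bytes), not part of the class.
def aligned_theta_offset_size(free_ub_bytes, theta_bytes=2, offset_bytes=4):
    can_use = free_ub_bytes // (theta_bytes + offset_bytes)
    size = can_use - can_use % 4              # multiple of 4 coefficients
    theta_burst = size * theta_bytes // 32    # whole 32-byte bursts for theta
    offset_burst = size * offset_bytes // 32  # whole 32-byte bursts for offsets
    # keep only as many elements as fit in whole bursts for both tensors
    return min(theta_burst * 32 // theta_bytes,
               offset_burst * 32 // offset_bytes)

# 1000 free bytes: 166 usable -> 164 (multiple of 4) -> min(10*16, 20*8) = 160
assert aligned_theta_offset_size(1000) == 160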