    def __init__(self, ib_, dtype):
        self.ib_ = ib_
        self.dtype = dtype
        self.type_size = tbe_platform.cce_intrin.get_bit_len(dtype) // 8
        self.cp_align_len = cce_params.BLOCK_REDUCE_INT8 // self.type_size

        self.unified_buffer_len = tbe_platform.get_soc_spec(
            tbe_platform.cce_conf.UB_SIZE) // self.type_size
        self.vec_align_len = cce_params.VECTOR_INST_BLOCK_WIDTH // self.type_size
        self.uint8_max_value = 255
        self.last_block = ib_.allocate("int32", (1, ),
                                       name="last_block",
                                       scope=cce_params.scope_reg)

        self.device_core_num = tbe_platform.get_soc_spec(
            tbe_platform.cce_conf.CORE_NUM)
        self.block = tvm.thread_axis("blockIdx.x")
        self.ib_.scope_attr(self.block, "thread_extent", self.device_core_num)

        # Placeholders for the input/output UB buffers, assigned later
        self.input_ub = 0
        self.output_ub = 0
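For reference, the two alignment lengths above are plain integer division. Here is a minimal, framework-free sketch, assuming the usual Ascend constants of a 32-byte copy block (BLOCK_REDUCE_INT8) and a 256-byte vector instruction block (VECTOR_INST_BLOCK_WIDTH); confirm both against cce_params on your target.

BLOCK_REDUCE_INT8 = 32          # assumed: bytes per copy block
VECTOR_INST_BLOCK_WIDTH = 256   # assumed: bytes per vector instruction block

def align_lengths(dtype_bits):
    """Return (elements per copy block, elements per vector block)."""
    type_size = dtype_bits // 8
    return (BLOCK_REDUCE_INT8 // type_size,
            VECTOR_INST_BLOCK_WIDTH // type_size)

print(align_lengths(16))  # float16 -> (16, 128)
print(align_lengths(32))  # float32 -> (8, 64)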
Example 2
    def _is_load3d_special():
        # Chip limitation (Ascend910): load3d is not supported when only
        # the padded fmap width equals the filter width.
        if get_soc_spec("SOC_VERSION") == 'Ascend910' \
                and fmap_h_padding != filter_h \
                and fmap_w_padding == filter_w:
            return False
        # Chip limitation: for kernel h and w in [1, 11], load3d supports
        # an output h/w of 1 when the padded fmap equals the filter in
        # at least one dimension.
        if (1 <= filter_h <= 11) and (1 <= filter_w <= 11) \
                and (fmap_h_padding == filter_h or fmap_w_padding == filter_w):
            return True
        return False
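The same predicate restated as a standalone function with its closed-over inputs made explicit; a sketch for reasoning about the rule, not the TBE helper itself.

def is_load3d_special(soc_version, fmap_h_padding, fmap_w_padding,
                      filter_h, filter_w):
    # Ascend910 cannot use load3d when only the padded fmap width
    # matches the filter width.
    if (soc_version == 'Ascend910'
            and fmap_h_padding != filter_h
            and fmap_w_padding == filter_w):
        return False
    # Kernels with h, w in [1, 11] whose padded fmap matches the filter
    # in at least one dimension take the special load3d path.
    if (1 <= filter_h <= 11 and 1 <= filter_w <= 11
            and (fmap_h_padding == filter_h or fmap_w_padding == filter_w)):
        return True
    return False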
Example 3
    def _min_l1_byte():
        # Fourth: L1 limitation, mainly required by the chip
        al1_min_byte = C0 * C0 * 2
        if not _is_conv1d_situation():
            kl1_min = fmap_w
        else:
            kl1_min = (C0 - 1) * stride_w + filter_w_dilation
        if dedy_w % C0 == 0:
            bl1_min_byte = filter_h_dilation * kl1_min * C0 * 2
        else:
            bl1_min_byte = (filter_h_dilation + stride_h) * kl1_min * C0 * 2

        l1_size = get_soc_spec("L1_SIZE")  # L1 size
        if (al1_min_byte + bl1_min_byte) > l1_size:
            dict_args = {"errCode": "E60026"}
            raise RuntimeError(dict_args, err_man.get_error_message(dict_args))
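The same bound as a framework-free helper, with every closed-over variable turned into a parameter (parameter names are illustrative, not part of the TBE API; sizes are bytes for float16 data):

def min_l1_bytes(c0, fmap_w, dedy_w, stride_w, stride_h,
                 filter_w_dilation, filter_h_dilation, is_conv1d):
    al1_min = c0 * c0 * 2  # float16: 2 bytes per element
    if is_conv1d:
        kl1_min = (c0 - 1) * stride_w + filter_w_dilation
    else:
        kl1_min = fmap_w
    if dedy_w % c0 == 0:
        bl1_min = filter_h_dilation * kl1_min * c0 * 2
    else:
        bl1_min = (filter_h_dilation + stride_h) * kl1_min * c0 * 2
    return al1_min + bl1_min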
Example 4
def euclidean_norm_d_compute(x,
                             y,
                             axes,
                             keepdims,
                             kernel_name="euclidean_norm_d"):
    """
    compute the Euclidean norm of the input along the given axes

    Parameters
    ----------
    x : TVM tensor
        the placeholder of input_x
    y : dict
        dict of output_y, include keys(shape and dtype)
    axes: int, list, tuple or NoneType
        the axes to reduce over.
    keepdims: bool or NoneType
        if true, retains reduced dimensions with length 1.
    kernel_name : str
        kernel name, default value is "euclidean_norm_d"

    Returns
    -------
    res: TVM tensor
        the calculation results
    """
    dtype = x.dtype.lower()
    shape = x.shape
    product = get_soc_spec("SOC_VERSION")
    if product != "Ascend310" and dtype != "float32":
        x = te.lang.cce.cast_to(x, "float32")
    # Collect the extents of the reduced axes; if every one equals 1,
    # the reduction is a no-op and a copy suffices.
    one_flag = [int(shape[i]) for i in list(axes)]

    if len(set(one_flag)) == 1 and one_flag[0] == 1:
        res = te.lang.cce.vmuls(x, 1)
    else:
        res_mul = te.lang.cce.vmul(x, x)
        res_sum = te.lang.cce.sum(res_mul, axes, keepdims)
        res = te.lang.cce.vsqrt(res_sum, 1)

    if res.dtype != dtype:
        res = te.lang.cce.cast_to(res, dtype)
    return res
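A NumPy reference mirroring the general branch above, for numerical sanity checks (a sketch, not part of the operator; note the DSL's all-ones shortcut copies x directly, whereas sqrt(x*x) yields |x| for negative inputs):

import numpy as np

def euclidean_norm_reference(x, axes, keepdims=False):
    """sqrt(sum(x * x)) over `axes`."""
    return np.sqrt(np.sum(x * x, axis=tuple(axes), keepdims=keepdims))

x = np.random.rand(2, 3, 4).astype(np.float32)
print(euclidean_norm_reference(x, axes=[1, 2]).shape)  # (2,)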
Example 5
    def _check_l1_limitation():
        block_size = 16
        w_value = dedy_w * stride_w
        if fmap_w > block_size:
            h_value_max = filter_h_dilation + 1
        elif block_size % fmap_w == 0:
            h_value_max = filter_h_dilation + block_size // fmap_w - 1
        else:
            h_value_max = filter_h_dilation + block_size // fmap_w + 1

        a_l1_size = h_value_max * w_value * \
                    ((filter_d_dilation - 2)//stride_d + 2) * block_size * 2
        b_l1_size = filter_h_dilation * filter_w_dilation * \
                    filter_d_dilation * block_size * block_size * 2
        l1_size = get_soc_spec("L1_SIZE")
        if (a_l1_size + b_l1_size) > l1_size:
            dict_args = {'errCode': 'E60022'}
            raise RuntimeError(dict_args,
                               err_man.get_error_message(dict_args))
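The h_value_max bound above can be read as: how many padded fmap rows must sit in L1 to produce one block of output. A standalone sketch of just that bound:

def h_value_max(fmap_w, filter_h_dilation, block_size=16):
    if fmap_w > block_size:
        return filter_h_dilation + 1
    if block_size % fmap_w == 0:
        return filter_h_dilation + block_size // fmap_w - 1
    return filter_h_dilation + block_size // fmap_w + 1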
Example 6
    def _check_l1_size_limit():
        def _l1fusion_size_limit(l1_size):
            l1fusion_l1_size = 0
            if pads != [0, 0, 0, 0] or [filter_h, filter_w] != [1, 1]:
                if stride_h > 1 or stride_w > 1:
                    l1fusion_l1_size = l1_size
            return l1fusion_l1_size

        c0_size = cce_params.C0_SIZE
        c0_size_k = cce_params.CUBE_MKN[filter_dtype]['mac'][1]
        w_value = dedy_w * stride_w

        if fmap_w > c0_size:
            h_value_max = filter_h_dilation + 1
        elif c0_size % fmap_w == 0:
            h_value_max = filter_h_dilation + c0_size // fmap_w - 1
        else:
            h_value_max = filter_h_dilation + c0_size // fmap_w + 1

        a_l1_size = h_value_max * w_value *\
                    c0_size_k * BIT_RATIO_DICT.get(out_backprop_dtype)
        if _is_conv1d_situation():
            load3d_stride = 1
            a_l1_m_length = (c0_size - 1) * load3d_stride + filter_w_dilation
            a_l1_size = a_l1_m_length *\
                        c0_size_k * BIT_RATIO_DICT.get(out_backprop_dtype)
        b_l1_size = filter_h_dilation * filter_w_dilation *\
                    c0_size * c0_size_k * BIT_RATIO_DICT.get(filter_dtype)
        if fusion_para.get("l1_fusion_type") != -1:
            a_l1_size = _l1fusion_size_limit(a_l1_size)
        l1_size = get_soc_spec("L1_SIZE")
        if (a_l1_size + b_l1_size) > l1_size:
            args_dict = {
                "errCode": "E60022",
            }
            raise RuntimeError(args_dict, err_man.get_error_message(args_dict))
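The conv1d branch above shrinks the A-matrix footprint to a single load3d window; restated as an illustrative standalone helper (names are not part of the TBE API):

def a_l1_size_conv1d(c0_size, c0_size_k, filter_w_dilation, dtype_bytes):
    # With load3d stride fixed at 1, one M-direction window covers
    # (c0_size - 1) + filter_w_dilation input columns.
    load3d_stride = 1
    a_l1_m_length = (c0_size - 1) * load3d_stride + filter_w_dilation
    return a_l1_m_length * c0_size_k * dtype_bytes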
Example 7
def cosine_embedding_loss_compute(x1,
                                  x2,
                                  target,
                                  output_y,
                                  x_shape_broadcat,
                                  tgt_shape_broadcast,
                                  margin=0,
                                  reduction='mean',
                                  kernel_name="cosine_embedding_loss"):
    """
    DSL description of the cosine_embedding_loss operator's calculation process

    Parameters
    ----------
    x1: TVM tensor
        the placeholder of x1 input data
    x2: TVM tensor
        the placeholder of x2 input data
    target: TVM tensor
        the placeholder of target input data
    output_y: TVM tensor
        the placeholder of the output data
    x_shape_broadcat: list
        x1 and x2 broadcast shape
    tgt_shape_broadcast: list
        x and target broadcast shape
    margin: float
        margin, default value is "0.0"
    reduction: str
        string indicating the reduce method, default value is "mean"
    kernel_name: str
        cce kernel name, default value is "cosine_embedding_loss"

    Returns
    -------
    res: TVM tensor
    """
    cce_plat = cceconf.get_soc_spec('SOC_VERSION')
    cast_dtype = 'float32'
    epsilon = tvm.const(1e-12, dtype="float32")

    if cce_plat == 'Ascend310':
        cast_dtype = 'float16'
        epsilon = tvm.const(5e-8, dtype="float16")

    if x1.dtype.lower() != cast_dtype and x1.dtype.lower() != 'float32':
        x1 = te.lang.cce.cast_to(x1, cast_dtype)

    if x2.dtype.lower() != cast_dtype and x2.dtype.lower() != 'float32':
        x2 = te.lang.cce.cast_to(x2, cast_dtype)

    target = te.lang.cce.cast_to(target, x1.dtype)

    x1_broadcast = te.lang.cce.broadcast(x1, x_shape_broadcat)
    x2_broadcast = te.lang.cce.broadcast(x2, x_shape_broadcat)
    target_broadcast = te.lang.cce.broadcast(target, tgt_shape_broadcast)

    # DSL description for cosine similarity compute
    prod = te.lang.cce.vmul(x1_broadcast, x2_broadcast)

    mag1 = te.lang.cce.vmul(x1_broadcast, x1_broadcast)
    mag2 = te.lang.cce.vmul(x2_broadcast, x2_broadcast)
    mag_square1 = te.lang.cce.sum(mag1, axis=1)
    mag_square2 = te.lang.cce.sum(mag2, axis=1)

    x1_epsilon = te.lang.cce.vadds(mag_square1, epsilon)
    x2_epsilon = te.lang.cce.vadds(mag_square2, epsilon)
    x1_sqrt = te.lang.cce.vsqrt(x1_epsilon)
    x2_sqrt = te.lang.cce.vsqrt(x2_epsilon)
    mode_num = te.lang.cce.vmul(x1_sqrt, x2_sqrt)
    prod_num = te.lang.cce.sum(prod, axis=1)
    cos_res = te.lang.cce.vdiv(prod_num, mode_num)

    # DSL description for 1 - cos(x1, x2)
    zero_tensor = te.lang.cce.vmuls(target_broadcast, 0)
    one_tensor = te.lang.cce.vadds(zero_tensor, 1)

    neg_one_tensor = te.lang.cce.vsub(zero_tensor, one_tensor)
    pos = te.lang.cce.vsub(one_tensor, cos_res)

    # DSL description for max(0, cos(x1, x2) - margin)
    margin_const = tvm.const(margin, dtype="float32")
    margin_tensor = te.lang.cce.vmuls(one_tensor, margin_const)
    neg_sub = te.lang.cce.vsub(cos_res, margin_tensor)
    neg = te.lang.cce.vmax(zero_tensor, neg_sub)

    # DSL description for output = pos if y == 1 else neg
    output_pos = te.lang.cce.vcmpsel(target_broadcast, one_tensor, 'eq', pos,
                                     zero_tensor)
    output_neg = te.lang.cce.vcmpsel(target_broadcast, neg_one_tensor, 'eq',
                                     neg, zero_tensor)
    res = te.lang.cce.vadd(output_pos, output_neg)
    if reduction in ['sum', 'mean']:
        if reduction == 'mean':
            num = reduce(lambda x, y: x * y, tgt_shape_broadcast)
            mean_cof = num**(-1)
            res = te.lang.cce.vmuls(res, mean_cof)
            res = te.lang.cce.cast_to(res, 'float32')

        reduce_axis = [index for index, _ in enumerate(tgt_shape_broadcast)]
        res_sum = te.lang.cce.sum(res, axis=reduce_axis)
        return res_sum

    return te.lang.cce.cast_to(res, 'float32')
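A NumPy reference of the same loss for sanity checks, assuming x1 and x2 of shape (N, D) and target of shape (N,) with values in {1, -1}; the epsilon and dtype handling of the DSL version are intentionally omitted:

import numpy as np

def cosine_embedding_loss_reference(x1, x2, target, margin=0.0,
                                    reduction='mean'):
    cos = np.sum(x1 * x2, axis=1) / (
        np.sqrt(np.sum(x1 * x1, axis=1)) * np.sqrt(np.sum(x2 * x2, axis=1)))
    # pos branch for target == 1, neg branch for target == -1, else 0
    loss = np.where(target == 1, 1.0 - cos,
                    np.where(target == -1,
                             np.maximum(0.0, cos - margin), 0.0))
    if reduction == 'mean':
        return loss.mean()
    if reduction == 'sum':
        return loss.sum()
    return loss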
Example 8
def bn_training_reduce_schedule_nd(res, core_num=None):
    """bn_training_reduce schedule method"""
    cce_emitinsn_params.cceEmitParamsIns.clear_param()
    # Prepare extra tensors
    # Step 1: Get two output tensors
    # Step 2: Merge two output tensors into Dummy
    # Step 3: Move UB data to GM tensor
    output_first = res[0]  # Square Sum
    output_second = res[1]  # Sum
    final_output = tvm.compute(
        output_first.shape,
        lambda *indices: output_first(*indices) + output_second(*indices),
        name="DummyYummySweety")
    is_cast = False
    if "cast" in output_second.op.input_tensors[0].name:
        is_cast = True
    # Calculate block split factor by axis_n_size and core_num
    axis_n_size = int(res[0].shape[1])
    if not core_num:
        core_num = int(cceconf.get_soc_spec("CORE_NUM"))
    # Multi core kernel requires aligned output
    element_size = cce_util.get_align_factor(output_first.dtype)[1]
    block_element_num = te.platform.cce_intrin_md.ALIGNMENT_BYTES // element_size
    estimate_block_split_factor = max(axis_n_size // core_num, 8)
    nearest_aligned_factor = estimate_block_split_factor % block_element_num
    # Decrease core_num for aligned output
    if estimate_block_split_factor < block_element_num and core_num > 1:
        return bn_training_reduce_schedule_nd(res, core_num - 1)
    # Round to the nearest
    block_split_factor = estimate_block_split_factor - nearest_aligned_factor
    # Calculate UB split
    ub_size = te.platform.CceProductParams().getParams("Unified_Buffer") // 2
    reduce_data_num = 1
    reduce_data_factor = 2
    if is_cast:
        reduce_data_factor = 3
    for reduce_axis in output_first.op.reduce_axis:
        reduce_data_num *= int(reduce_axis.dom.extent)
    reduce_data_num *= reduce_data_factor
    max_possible_loop = ub_size // (element_size * reduce_data_num)
    actual_loop = 1
    for loop in range(max_possible_loop - 1, 0, -1):
        if block_split_factor % loop == 0:
            actual_loop = loop
            break
    # Force aligned if multi-core is enabled
    if actual_loop < block_element_num and actual_loop < block_split_factor and core_num > 1:
        actual_loop = block_element_num

    # Find all tensors
    if is_cast:
        # With Cast, prepare tensor parameters
        mul_tensor = output_first.op.input_tensors[0]
        cast_tensor = mul_tensor.op.input_tensors[0]
        res_input = cast_tensor.op.input_tensors[0]
        input_tensor_next = [cast_tensor]  # First compute tensor is cast_tensor
        ub_tensors = [cast_tensor, mul_tensor, output_first, output_second]
    else:
        # Without Cast, prepare tensor parameters
        cast_tensor = None
        mul_tensor = output_first.op.input_tensors[0]
        res_input = mul_tensor.op.input_tensors[0]
        # Without a cast, the first compute tensors are mul_tensor and
        # output_second
        input_tensor_next = [mul_tensor, output_second]
        ub_tensors = [mul_tensor, output_first, output_second]

    # Create original schedule
    sch = tvm.create_schedule(final_output.op)
    # ////////////////////////////////////
    # ///////// DataFlow Control /////////
    # ////////////////////////////////////
    # Read input in
    input_tensor_ub = sch.cache_read(res_input, cce_params.scope_ubuf,
                                     input_tensor_next)
    ub_tensors.append(input_tensor_ub)
    # Compute procedure in ubuf
    for ub_tens in ub_tensors:
        sch[ub_tens].set_scope(cce_params.scope_ubuf)
    # ////////////////////////////////////
    # //////// Split axis Control ////////
    # ////////////////////////////////////
    outer, inner = \
        sch[final_output].split(sch[final_output].op.axis[1],
                                factor=block_split_factor)
    ub_outer, ub_inner = sch[final_output].split(inner, factor=actual_loop)
    sch[final_output].bind(outer, tvm.thread_axis("blockIdx.x"))
    # ////////////////////////////////////
    # ///////// Compute Control //////////
    # ////////////////////////////////////
    compute_at_axis = ub_outer
    for ub_tens in ub_tensors:
        sch[ub_tens].compute_at(sch[final_output], compute_at_axis)
    # ////////////////////////////////////
    # //////////// EmitInsn //////////////
    # ////////////////////////////////////

    def emit_on_self(tensor, axisnum=0, op='dma_copy'):
        """Do emit insn"""
        sch[tensor].emit_insn(sch[tensor].op.axis[axisnum], op)

    def emit_on_self_ex(tensor, axis, op='dma_copy'):
        """Do emit insn"""
        sch[tensor].emit_insn(axis, op)

    # Fake results
    emit_on_self(input_tensor_ub, 0)
    if is_cast:
        emit_on_self(cast_tensor, 0, cast_tensor.op.tag.split('|')[0])
    emit_on_self(mul_tensor, 0, mul_tensor.op.tag)

    sch[output_first].pragma(sch[output_first].op.axis[1], "emit_insn",
                             "bn_reduce_sum")
    sch[output_second].pragma(sch[output_second].op.axis[1], "emit_insn",
                              "bn_reduce_sum")
    sch[output_first].double_buffer()
    sch[output_second].double_buffer()

    emit_on_self_ex(final_output, ub_inner, "binary_reduce_output_reversed")

    def new_alloc(dtype, shape, name):
        """Alloc mem"""
        new_buffer = tvm.decl_buffer(shape,
                                     dtype,
                                     name=name,
                                     scope="",
                                     data=None)
        return new_buffer

    out_buffer_sec = new_alloc(final_output.dtype, (block_split_factor, ),
                               "reduce_sec_output_gm")
    cce_emitinsn_params.cceEmitParamsIns.insert_param(
        "binary_reduce_output_buffer", out_buffer_sec)
    tensor_list = [res_input, final_output, out_buffer_sec]

    return sch, tensor_list
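The block-split logic in isolation, as an illustrative sketch: estimate a per-core factor, retry with fewer cores while the estimate is under one aligned block, then round down to a multiple of the 32-byte block element count.

def block_split(axis_n_size, core_num, block_element_num):
    estimate = max(axis_n_size // core_num, 8)
    if estimate < block_element_num and core_num > 1:
        # Mirror the schedule's retry with one fewer core.
        return block_split(axis_n_size, core_num - 1, block_element_num)
    return estimate - estimate % block_element_num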
Example 9
    def __init__(self,
                 input_x,
                 auxiliary_coefficients,
                 auxiliary_offset,
                 kernel_name='stn_compute'):

        self.d_type_x = input_x.get('dtype')
        self.theta_dtype = auxiliary_coefficients.get('dtype')
        self.position_dtype = auxiliary_offset.get('dtype')
        self.shape = input_x.get('shape')

        self.kernel_name = kernel_name

        # product_name = tik_get_soc_name.get_soc_name()
        self.tik_instance = tik.Tik(tik.Dprofile())
        self.ai_core_num = tik.Dprofile().get_aicore_num()

        ub_size_bytes = cce.get_soc_spec(
            cce.cce_conf.UB_SIZE) // 2  # double buffer
        self.d_type_bytes_size = cce.cce_intrin.get_bit_len(self.d_type_x) // 8
        self.theta_type_bytes_size = cce.cce_intrin.get_bit_len(
            auxiliary_coefficients.get('dtype')) // 8
        self.offset_type_bytes_size = cce.cce_intrin.get_bit_len(
            auxiliary_offset.get('dtype')) // 8
        self.vec_compute_size = 256

        # theta size output_h * output_w * 4 * n * c1
        self.theta_size = auxiliary_coefficients.get('shape')[0] * auxiliary_coefficients.get('shape')[1] * \
                          auxiliary_coefficients.get('shape')[2]

        # output_h * output_w
        self.output_hw = self.theta_size // 4 // self.shape[0] // self.shape[1]

        # tiling policy
        self.total_c1 = self.shape[0] * self.shape[1]
        self.ub_tensor_size = 16 if self.shape[1] * self.shape[4] * self.d_type_bytes_size > ub_size_bytes * 0.4 else \
            self.shape[1] * self.shape[4]
        self.input_stride = (self.shape[2] * self.shape[3] * self.shape[4] - self.shape[4]) \
                            * self.d_type_bytes_size // 32
        self.output_stride = (self.output_hw * self.shape[4] -
                              self.shape[4]) * self.d_type_bytes_size // 32
        self.if_skip_read_ceof = self.ub_tensor_size != 16

        # nc1hwc0 c0 = 16 theta type same as input_x
        self.ub_tensor_len = self.ub_tensor_size * self.d_type_bytes_size // 32

        self.input_hw = self.shape[2] * self.shape[3]

        # ub theta size must be a multiple of 4 and 32
        ub_theta_offset_can_use = (ub_size_bytes - self.ub_tensor_size * self.d_type_bytes_size * 2) \
                                  // (self.theta_type_bytes_size + self.offset_type_bytes_size)
        self.ub_theta_offset_size = ub_theta_offset_can_use - ub_theta_offset_can_use % 4

        theta_burst_len = self.ub_theta_offset_size * self.theta_type_bytes_size // 32
        offset_burst_len = self.ub_theta_offset_size * self.offset_type_bytes_size // 32
        self.ub_theta_offset_size = min(
            theta_burst_len * 32 // self.theta_type_bytes_size,
            offset_burst_len * 32 // self.offset_type_bytes_size)

        self.input_num = reduce(lambda x, y: x * y, input_x.get('shape'))
        # self.input_num = self.theta_size * 4

        # input data
        self.input_x_gm = self.tik_instance.Tensor(self.d_type_x,
                                                   (self.input_num, ),
                                                   name='input_x_gm',
                                                   scope=tik.scope_gm)
        # theta matrix
        self.input_theta_gm = self.tik_instance.Tensor(
            auxiliary_coefficients.get('dtype'), (self.theta_size, ),
            name='input_theta_gm',
            scope=tik.scope_gm)
        # position offset matrix
        self.input_position_gm = self.tik_instance.Tensor(
            auxiliary_offset.get('dtype'), (self.theta_size, ),
            name='input_position_gm',
            scope=tik.scope_gm)

        # output data
        self.output_y_gm = self.tik_instance.Tensor(
            self.d_type_x,
            (self.output_hw * self.shape[0] * self.shape[1] * self.shape[4], ),
            name='output_y_gm',
            scope=tik.scope_gm)
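The burst-length rounding near the end of the tiling above, restated as a hedged standalone helper: the element count is first kept a multiple of 4 (one theta group), then clipped so both the theta and the offset copies land on whole 32-byte bursts.

def aligned_theta_offset_size(size, theta_bytes, offset_bytes):
    size -= size % 4  # keep a multiple of 4 (one theta group)
    theta_burst = size * theta_bytes // 32
    offset_burst = size * offset_bytes // 32
    return min(theta_burst * 32 // theta_bytes,
               offset_burst * 32 // offset_bytes)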