Example 1
def _bessel_i0e_compute(input_data):
    """bessel i0e compute"""

    shape_input = input_data.shape
    dtype_input = input_data.dtype

    # cast fp16 input to fp32 at the start for accuracy
    if dtype_input == "float16":
        input_data = Cast(input_data, "float32", target=utils.CCE)
    abs_data = Abs(input_data, target=utils.CCE)

    # compute bessel_i0e for data in (-3.75, 3.75)
    # t = |x| / 3.75
    # I0e = e^-|x| * (1 + 3.5156229t^2 + 3.0899424t^4 + 1.2067492t^6 + 0.2659732t^8
    #       + 0.0360768t^10 + 0.0045813t^12), |x| <= 3.75
    broad_const_limit = akg.lang.ascend.broadcast(
        akg.tvm.const(CONST_LIMIT, "float32"), shape_input)
    before_abs_data = minimum(abs_data, broad_const_limit)
    data = topi.multiply(before_abs_data, 1.0 / CONST_LIMIT)
    square_data = mul(data, data, target=utils.CCE)
    before_res = topi.multiply(square_data, ITR_BEFORE[LEN_BEFORE - 1])
    before_res = topi.add(before_res, ITR_BEFORE[LEN_BEFORE - 2])
    for iter_number in ITR_BEFORE[LEN_BEFORE - 3::-1]:
        before_res = mul(before_res, square_data, target=utils.CCE)
        before_res = topi.add(before_res, iter_number)
    exp_data = Exp(neg(before_abs_data, target=utils.CCE), target=utils.CCE)
    before_res = mul(before_res, exp_data, target=utils.CCE)

    # compute bessel_i0e for data in other domain
    # t = |x| / 3.75
    # I0e(x) = (1 / sqrt(|x|)) * (0.39894228 + 0.01328592t^-1 + 0.00225319t^-2 - 0.00157565t^-3
    #           + 0.00916281t^-4 - 0.02057706t^-5 + 0.02635537t^-6 - 0.01647633t^-7
    #           + 0.00392377t^-8), |x| >= 3.75
    data = Divide(broad_const_limit, abs_data, target=utils.CCE)
    after_res = topi.multiply(data, ITR_AFTER[LEN_AFTER - 1])
    after_res = topi.add(after_res, ITR_AFTER[LEN_AFTER - 2])
    for iter_number in ITR_AFTER[LEN_AFTER - 3::-1]:
        after_res = mul(after_res, data, target=utils.CCE)
        after_res = topi.add(after_res, iter_number)
    rsqrt_data = rsqrt(abs_data, target=utils.CCE)
    after_res = mul(after_res, rsqrt_data, target=utils.CCE)
    after_res = minimum(before_res, after_res, target=utils.CCE)

    # cast back to fp16 at the end if the input was fp16
    if dtype_input == "float16":
        after_res = Cast(after_res, "float16", target=utils.CCE)

    return after_res
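
As a reference, here is a minimal NumPy sketch of the same two-branch evaluation. It assumes the module constants CONST_LIMIT, ITR_BEFORE and ITR_AFTER hold the coefficients quoted in the comments above; bessel_i0e_ref is a name introduced here.

import numpy as np

CONST_LIMIT = 3.75
ITR_BEFORE = (1.0, 3.5156229, 3.0899424, 1.2067492,
              0.2659732, 0.0360768, 0.0045813)
ITR_AFTER = (0.39894228, 0.01328592, 0.00225319, -0.00157565, 0.00916281,
             -0.02057706, 0.02635537, -0.01647633, 0.00392377)

def bessel_i0e_ref(x):
    """NumPy reference for the two-branch i0e evaluation above."""
    abs_x = np.abs(x)
    # |x| <= 3.75 branch: polynomial in t^2 with t = min(|x|, 3.75) / 3.75,
    # scaled by e^-min(|x|, 3.75)
    clipped = np.minimum(abs_x, CONST_LIMIT)
    t2 = (clipped / CONST_LIMIT) ** 2
    before = np.polynomial.polynomial.polyval(t2, ITR_BEFORE) * np.exp(-clipped)
    # |x| >= 3.75 branch: polynomial in t = 3.75 / |x|, scaled by 1 / sqrt(|x|);
    # clamp small |x| to avoid overflow -- the minimum below picks `before` there
    safe = np.maximum(abs_x, 1.0)
    after = (np.polynomial.polynomial.polyval(CONST_LIMIT / safe, ITR_AFTER)
             / np.sqrt(safe))
    # like the kernel, blend the two branches with an elementwise minimum
    return np.minimum(before, after)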
Example 2
def fused_gather_nd_reduce_sum_mul_unsorted_segment_sum(input1, input2, input3,
                                                        input4, input5, axis=0,
                                                        keepdims=False, num=0,
                                                        target=utils.CUDA):
    item_get = gather_nd(input1, input2)
    sum_axis = reduce_sum(item_get, axis, keepdims, target)
    prod = mul(sum_axis, input3, target=utils.CUDA)
    res1 = unsorted_segment_sum(prod, input4, num, op_id=0)
    res2 = unsorted_segment_sum(prod, input5, num, op_id=1)
    return res1, res2
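
A hedged NumPy reading of this fused pipeline, assuming gather_nd follows the usual "last axis of the index tensor indexes the leading axes of input1" convention and that the segment ids index axis 0 of the product (the *_ref name is introduced here):

import numpy as np

def fused_gather_nd_reduce_sum_mul_unsorted_segment_sum_ref(
        input1, input2, input3, input4, input5, axis=0, keepdims=False, num=0):
    # gather_nd: each row of input2 picks one slice of input1
    item_get = input1[tuple(np.moveaxis(input2, -1, 0))]
    sum_axis = item_get.sum(axis=axis, keepdims=keepdims)
    prod = sum_axis * input3
    # unsorted_segment_sum: scatter-add rows of prod into `num` segments
    res1 = np.zeros((num,) + prod.shape[1:], dtype=prod.dtype)
    np.add.at(res1, input4, prod)
    res2 = np.zeros((num,) + prod.shape[1:], dtype=prod.dtype)
    np.add.at(res2, input5, prod)
    return res1, res2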
Example 3
def _bessel_i1e_compute(input_data):
    """bessel i1e compute"""

    shape = utils.get_shape(input_data)
    dtype = input_data.dtype

    # cast fp16 input to fp32 at the start for accuracy
    if dtype == "float16":
        input_data = Cast(input_data, "float32", target=utils.CCE)

    abs_data = Abs(input_data, target=utils.CCE)
    # compute bessel_i1e for data in (-3.75, 3.75)
    before_res = _before_res_compute(abs_data)
    # compute bessel_i1e for data in other domain
    after_res = _after_res_compute(abs_data)

    # As the vcmp_lt and vsel instructions don't support fp32 on mini,
    # compare in fp16 and cast back; this could be simplified by methods such as "auto cast"
    if product_is_mini():
        res = akg.tvm.compute(
            shape, lambda *indice: akg.tvm.expr.Select(
                abs_data[indice].astype("float16") < akg.tvm.const(
                    CONST_LIMIT, "float16"), before_res[indice].astype(
                        "float16"), after_res[indice].astype("float16")))
        res = Cast(res, "float32", target=utils.CCE)
    else:
        res = akg.tvm.compute(
            shape,
            lambda *indice: akg.tvm.expr.Select(abs_data[
                indice] < CONST_LIMIT, before_res[indice], after_res[indice]))
    data_sign = Sign(input_data, target=utils.CCE)
    res = mul(res, data_sign, target=utils.CCE)
    if dtype == "float16":
        res = Cast(res, "float16", target=utils.CCE)
    return res
Example 4
def matmul_mul(x,
               y,
               c,
               b,
               out_dtype,
               left_format="zZ",
               right_format="nZ",
               out_format="zN",
               transpose_x=False,
               transpose_y=False,
               attrs=None,
               target="cce"):
    matmul_res, attrs = matmul(x,
                               y,
                               b,
                               out_dtype,
                               left_format,
                               right_format,
                               out_format,
                               transpose_x,
                               transpose_y,
                               attrs=attrs)
    res = mul(matmul_res, c, target=target)
    return res, attrs
Example 5
def mul_mean(first_input,
             second_input,
             axis=None,
             keepdims=False,
             target="cce"):
    temp = mul(first_input, second_input, target=target)
    output, _ = mean(temp, axis, keepdims)
    return output
Example 6
def mul_conv(data, fmap_shape, filter_shape, pad_, stride_, dilation_, use_bias=False,
             block_size=16, attrs=None, target="cce"):
    a1 = data[0]
    a2 = data[1]
    b = data[2]
    a = mul(a1, a2, target=target)
    if use_bias:
        conv_data = [a, b, data[3]]
    else:
        conv_data = [a, b]
    res = Conv(conv_data, fmap_shape, filter_shape, pad_, stride_, dilation_, use_bias, block_size, attrs)
    return res
Example 7
def _after_res_compute(abs_data):
    """
    compute bessel_i1e for abs value of data greater than or equal to 3.75

    Algorithm:
    t = 3.75 / x
    I1(x) = (1 / sqrt(x))*(0.39894228 - 0.03988024t - 0.00362018t^2
                           + 0.00163801t^3 - 0.01031555t^4 + 0.02282967t^5
                           - 0.02895312t^6 + 0.01787654t^7 - 0.00420059t^8)
    """
    broad_const_limit = akg.lang.ascend.broadcast(
        akg.tvm.const(CONST_LIMIT, abs_data.dtype), abs_data.shape)
    data = Divide(broad_const_limit, abs_data, target=utils.CCE)
    after_res = topi.multiply(data, ITR_AFTER[LEN_AFTER - 1])
    after_res = topi.add(after_res, ITR_AFTER[LEN_AFTER - 2])
    for iter_number in ITR_AFTER[LEN_AFTER - 3::-1]:
        after_res = mul(after_res, data, target=utils.CCE)
        after_res = topi.add(after_res, iter_number)
    abs_data_rsqrt = rsqrt(abs_data, target=utils.CCE)
    after_res = mul(after_res, abs_data_rsqrt, target=utils.CCE)
    return after_res
Example 8
def _before_res_compute(abs_data):
    """
    compute bessel_i1e for abs value of data less than or equal to 3.75

    Algorithm:
    t = x / 3.75
    I1(x) = e^-|x|*x*(0.5 + 0.87890594t^2 + 0.51498869t^4 + 0.15084934t^6
                    + 0.02658773t^8 + 0.00301532t^10 + 0.00032411t^12)
    """

    data = topi.multiply(abs_data, 1.0 / CONST_LIMIT)
    data_square = mul(data, data, target=utils.CCE)
    before_res = topi.multiply(data_square, ITR_BEFORE[LEN_BEFORE - 1])
    before_res = topi.add(before_res, ITR_BEFORE[LEN_BEFORE - 2])
    for iter_number in ITR_BEFORE[LEN_BEFORE - 3::-1]:
        before_res = mul(before_res, data_square, target=utils.CCE)
        before_res = topi.add(before_res, iter_number)
    exp_value = Exp(neg(abs_data, target=utils.CCE), target=utils.CCE)
    before_res = mul(before_res, exp_value, target=utils.CCE)
    before_res = mul(before_res, abs_data, target=utils.CCE)
    return before_res
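
Putting the two helpers together with the Select in _bessel_i1e_compute gives the full i1e evaluation. A NumPy sketch, assuming ITR_BEFORE/ITR_AFTER carry the coefficients from the two docstrings (reproduced here as I1E_BEFORE/I1E_AFTER):

import numpy as np

CONST_LIMIT = 3.75
I1E_BEFORE = (0.5, 0.87890594, 0.51498869, 0.15084934,
              0.02658773, 0.00301532, 0.00032411)
I1E_AFTER = (0.39894228, -0.03988024, -0.00362018, 0.00163801, -0.01031555,
             0.02282967, -0.02895312, 0.01787654, -0.00420059)

def bessel_i1e_ref(x):
    """NumPy sketch: evaluate both branches, select on |x|, restore the sign."""
    abs_x = np.abs(x)
    t2 = (abs_x / CONST_LIMIT) ** 2
    before = (np.polynomial.polynomial.polyval(t2, I1E_BEFORE)
              * np.exp(-abs_x) * abs_x)
    # clamp small |x| to avoid overflow: the where() below never selects `after` there
    safe = np.maximum(abs_x, 1.0)
    after = (np.polynomial.polynomial.polyval(CONST_LIMIT / safe, I1E_AFTER)
             / np.sqrt(safe))
    return np.where(abs_x < CONST_LIMIT, before, after) * np.sign(x)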
Example 9
def sigmoid_cross_entropy_with_logits(labels=None, logits=None, target="cce"):
    ##
    # \brief Computes sigmoid cross entropy given `logits`.
    #
    # \f[
    #   cost = labels * -log(sigmoid(logits)) + (1 - labels) * -log(1 - sigmoid(logits))
    # \f]
    # \param labels akg.tvm.Tensor of the same type and shape as `logits`.
    # \param  logits akg.tvm.Tensor of type float16, float32
    #
    # \return akg.tvm.Tensor of the same shape as `logits` with the componentwise logistic losses.
    ##

    if get_shape(logits) != get_shape(labels):
        raise ValueError(
            "logits and labels must have the same shape  (%s vs %s)" %
            (get_shape(logits), get_shape(labels)))
    if logits.dtype != labels.dtype:
        raise ValueError(
            "logits and labels must have the same dtype  (%s vs %s)" %
            (logits.dtype, labels.dtype))

    shape = logits.shape
    dtype = logits.dtype

    check_list = ["float16", "float32"]
    if not (dtype.lower() in check_list):
        raise RuntimeError(
            "sigmoid_cross_entropy_with_logits only supports %s, but dtype is %s"
            % (",".join(check_list), dtype))

    #    z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
    # =  z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
    # =  max(x, 0) - x * z + log(1 + exp(-abs(x)))

    zero = akg.tvm.const(0, dtype=dtype)
    relu_logits = akg.tvm.compute(
        shape,
        lambda *indice: akg.tvm.expr.Select(
            logits(*indice) < zero, zero, logits(*indice)),
        name="relu_logits")
    neg_abs_logits = akg.tvm.compute(
        shape,
        lambda *indice: akg.tvm.expr.Select(
            logits(*indice) < zero, logits(*indice),
            logits(*indice) * -1),
        name="neg_abs_logits")
    sigmoid_logits = Exp(neg_abs_logits, target=target) + akg.tvm.const(
        1, dtype=dtype)
    ln_sigmoid_logits = log(sigmoid_logits, target=target)
    logits_mul_labels = mul(logits, labels, target=target)
    res = relu_logits - logits_mul_labels + ln_sigmoid_logits
    return res
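
The stable rewrite in the comment, max(x, 0) - x * z + log(1 + exp(-|x|)), is easy to check against a plain NumPy version (a reference sketch, not the kernel):

import numpy as np

def sigmoid_cross_entropy_with_logits_ref(labels, logits):
    x, z = logits, labels
    # exp(-|x|) never overflows, so log1p stays finite for any x
    return np.maximum(x, 0) - x * z + np.log1p(np.exp(-np.abs(x)))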
Example 10
def mul_unsortedsegmentsum(input1,
                           input2,
                           ids_tensor,
                           num_segments,
                           target="cce"):
    import akg.tvm
    temp = mul(input1, input2, target=target)
    output = unsorted_segment_sum(temp,
                                  ids_tensor,
                                  num_segments,
                                  target=target)[0]
    output = akg.tvm.compute(output.shape, lambda *i: output(*i),
                             "fused_mul_unsorted")
    return output
Example 11
def fake_quant_with_min_max_vars_per_channel_compute(input_data,
                                                     input_min,
                                                     input_max,
                                                     num_bits=8,
                                                     narrow_range=False):
    """fake_quant_with_min_max_vars_per_channel compute implemention"""
    shape = get_shape(input_data.shape)
    dtype = input_data.dtype
    min_broadcast = akg.lang.ascend.broadcast(input_min, shape, dtype)
    max_broadcast = akg.lang.ascend.broadcast(input_max, shape, dtype)
    # get nudged_min and nudged_max by nudged_min_max_compute function
    nudged_min_nudged_max = nudged_min_max_compute(min_broadcast,
                                                   max_broadcast, num_bits,
                                                   narrow_range)
    # clamp the input between nudged_min and nudged_max
    clamped_tmp = topi.minimum(input_data, nudged_min_nudged_max[1])
    clamped = topi.maximum(clamped_tmp, nudged_min_nudged_max[0])

    # calculate the quantized and dequantized results
    clamped_shifted = topi.subtract(clamped, nudged_min_nudged_max[0])
    if product_is_mini():
        clamped_shifted_div_scale = mul(clamped_shifted,
                                        reciprocal(nudged_min_nudged_max[2]),
                                        target=utils.CCE)
    else:
        clamped_shifted_div_scale = Divide(clamped_shifted,
                                           nudged_min_nudged_max[2],
                                           target=utils.CCE)
    result_tmp = topi.add(clamped_shifted_div_scale, dc.half_const(dtype))
    floor_result_tmp = akg.lang.ascend.floor(result_tmp)
    if product_is_mini():
        floor_result_tmp = topi.cast(floor_result_tmp, "float16")

    floor_result_tmp = topi.cast(floor_result_tmp, "float32")
    scale_product = topi.multiply(floor_result_tmp, nudged_min_nudged_max[2])
    tmp_res = topi.add(scale_product, nudged_min_nudged_max[0])
    # get bool_both_zero_value by bool_both_zero_compute function
    bool_both_zero_value = bool_both_zero_compute(min_broadcast, max_broadcast)
    res = topi.multiply(tmp_res, bool_both_zero_value)

    return res
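
Setting aside the dtype juggling and the bool_both_zero masking, the clamp/quantize/dequantize core reduces to the following NumPy sketch (fake_quant_ref is a name introduced here; nudged_min, nudged_max and scale are the three outputs of nudged_min_max_compute):

import numpy as np

def fake_quant_ref(x, nudged_min, nudged_max, scale):
    clamped = np.clip(x, nudged_min, nudged_max)
    # quantize: shift to the nudged zero, divide by scale, round half up
    quantized = np.floor((clamped - nudged_min) / scale + 0.5)
    # dequantize back to the input's value domain
    return quantized * scale + nudged_min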
Example 12
def fused_gather_gather_add_mul_max_exp_scatter_add(inp1,
                                                    inp2,
                                                    inp3,
                                                    inp4,
                                                    axis,
                                                    target=utils.CUDA):
    ndim = len(inp1.shape)
    axis = axis + ndim if axis < 0 else axis
    assert axis >= 0
    assert axis < ndim

    gather_out1 = gather(inp1, inp2, axis, "1")
    gather_out2 = gather(inp1, inp2, axis, "2")

    add_out = Add(gather_out1, gather_out2, target=target)
    mul_out = mul(add_out, inp3, target=utils.CUDA)
    max_out = maximum(add_out, mul_out, target=utils.CUDA)
    exp_out = Exp(max_out, target=utils.CUDA)
    scatter_out = scatter_add(inp1, inp4, exp_out)

    return exp_out, scatter_out
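
One plausible NumPy reading of this pipeline, assuming 1-D index tensors and gather/scatter along axis 0 (the akg operators may be more general than this sketch):

import numpy as np

def fused_gather_gather_add_mul_max_exp_scatter_add_ref(inp1, inp2, inp3, inp4):
    g1 = np.take(inp1, inp2, axis=0)     # gather op "1"
    g2 = np.take(inp1, inp2, axis=0)     # gather op "2" of the same data
    add_out = g1 + g2
    mul_out = add_out * inp3
    exp_out = np.exp(np.maximum(add_out, mul_out))
    scatter_out = inp1.copy()
    np.add.at(scatter_out, inp4, exp_out)  # scatter_add into a copy of inp1
    return exp_out, scatter_out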
Example 13
def matmul_mul_transdata(x,
                         y,
                         c,
                         b,
                         out_dtype,
                         left_format="zZ",
                         right_format="nZ",
                         out_format="zN",
                         transpose_x=False,
                         transpose_y=False,
                         attrs=None,
                         target="cce"):
    matmul_res, attrs = matmul(x,
                               y,
                               b,
                               out_dtype,
                               left_format,
                               right_format,
                               out_format,
                               transpose_x,
                               transpose_y,
                               attrs=attrs)
    res = mul(matmul_res, c, target=target)
    if out_format == 'zN':
        n1, m1, m0, n0 = matmul_res.shape[-4:]
    elif out_format == 'zZ':
        m1, n1, m0, n0 = matmul_res.shape[-4:]
    else:
        raise ValueError("unsupported out_format: %s" % out_format)
    new_shape = matmul_res.shape[:-4] + [m1 * m0, n1 * n0]

    func = akg.tvm.get_global_func("TransData")
    res = func(
        [res], {
            "src_format": "FRACTAL_NZ",
            "dst_format": "DefaultFormat",
            "output_shape": new_shape
        })
    return res, attrs
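
For a plain 4D result, the FRACTAL_NZ ('zN') to DefaultFormat conversion performed by TransData amounts to a block transpose. A NumPy sketch under that assumption (zn_to_default_ref is a name introduced here):

import numpy as np

def zn_to_default_ref(x):
    # zN stores matrix element (m, n) at [n // n0, m // m0, m % m0, n % n0]
    n1, m1, m0, n0 = x.shape
    # regroup the blocks into a plain (M, N) = (m1*m0, n1*n0) matrix
    return x.transpose(1, 2, 0, 3).reshape(m1 * m0, n1 * n0)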
Example 14
def div_no_nan(data_x, data_y, target=utils.CCE):
    """
    Returns 0 where the denominator is zero; otherwise behaves like Div.

    Args:
        data_x (tvm.tensor.Tensor): tensor with type int32/int8/uint8, float16/float32.
        data_y (tvm.tensor.Tensor): tensor with type int32/int8/uint8, float16/float32.

    Returns:
        tvm.tensor.Tensor.
    """
    dtype = data_x.dtype
    if dtype != data_y.dtype:
        raise TypeError("input dtype should be the same")
    utils.ops_dtype_check(dtype, [utils.DtypeForDavinci.ALL_FLOAT,
                                  utils.DtypeForDavinci.INT8,
                                  utils.DtypeForDavinci.UINT8,
                                  utils.DtypeForDavinci.INT32])

    utils.check_shape(data_x.shape)
    utils.check_shape(data_y.shape)
    utils.auto_broadcast_check(data_x, data_y)

    # dtype for vsel and vcmp
    if product_is_mini():
        compute_dtype = "float16"
    else:
        compute_dtype = "float32"
 
    # fp16 division returns 0 when |y| < 2^-12
    # fp32 division returns 0 when |y| < 2^-64
    min_val = tvm.const(2**(-12) if product_is_mini() else 2**(-64),
                        dtype=compute_dtype)    

    tvm_one = tvm.const(1, dtype=compute_dtype)
    tvm_zero = tvm.const(0, dtype=compute_dtype)
    
    if not product_is_mini() and dtype == "float16":
        min_val = tvm.const(2**(-12), "float32")

    data_y_fp32 = akg.lang.ascend.cast_to(data_y, "float32")
    # clip to avoid overflow when casting fp32 to fp16 on mini (y > 2^15)
    clip_y_fp32 = akg.topi.clip(data_y_fp32, -1.0, 1.0)
    abs_clip_y_fp32 = Abs(clip_y_fp32, target)
    y_cmp = akg.lang.ascend.cast_to(abs_clip_y_fp32, compute_dtype) 

    is_zero = tvm.compute(data_y.shape,
                          lambda *i : tvm.expr.Select(
                              y_cmp(*i) < min_val, tvm_one, tvm_zero), 
                          name="is_zero")    
    
    # for fp32, cast(y, fp16) == 0 when y < 2^-24; refine to catch y in (2^-64, 2^-24):
    if product_is_mini() and dtype == "float32":
        is_zero = _refine_is_zero(is_zero, abs_clip_y_fp32)
    
    is_zero = akg.lang.ascend.cast_to(is_zero, "float32")
    not_zero = tvm.compute(data_y.shape,
                           lambda *i : (1 - is_zero(*i)).astype("float32"),
                           name="not_zero")    
   
    # replace [x1 x2]/[y1 0] by [x1 0]/[y1 1] 
    data_x = mul(akg.lang.ascend.cast_to(data_x, "float32"), not_zero, target=target)
    data_y = akg.lang.ascend.cast_to(data_y, "float32") + is_zero
    res = Divide(data_x, data_y, target=target)

    if dtype in ("int8", "uint8", "int32"):
        res = akg.lang.ascend.floor(res)
        res = akg.lang.ascend.cast_to(res, dtype)
    else:
        res = akg.lang.ascend.cast_to(res, dtype)
    return res
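
Stripped of the mini-specific dtype workarounds and the integer flooring, the zero-denominator logic is equivalent to this NumPy sketch (min_val stands in for the platform threshold chosen above):

import numpy as np

def div_no_nan_ref(x, y, min_val=2.0 ** -64):
    is_zero = np.abs(y) < min_val
    # replace [x1 x2] / [y1 0] with [x1 0] / [y1 1], as in the kernel comment
    safe_x = np.where(is_zero, 0.0, x)
    safe_y = np.where(is_zero, 1.0, y)
    return safe_x / safe_y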
Example 15
def nudged_min_max_compute(min_broadcast, max_broadcast, num_bits,
                           narrow_range):
    """
    Calculate the maximum and minimum values of the quantization.

    Notes:
        For each channel, scale[i] equals (max_broadcast[i] - min_broadcast[i]) / (quant_max - quant_min).
        The nudged zero point is then
                nudged_zero_point = floor(between_min_max_float + 0.5) + less_quant_min_float + more_quant_max_float,
        where the raw zero point is
                zero_point_from_min = quant_min_float - min_broadcast / scale,
        between_min_max_float equals zero_point_from_min where quant_min_float <= zero_point_from_min <= quant_max_float
        and 0 elsewhere, less_quant_min_float equals quant_min where zero_point_from_min < quant_min_float and 0 elsewhere,
        and more_quant_max_float analogously equals quant_max where zero_point_from_min > quant_max_float.
        Finally, nudged_min and nudged_max follow from scale and nudged_zero_point:
                 nudged_min = (quant_min - nudged_zero_point) * scale
                 nudged_max = (quant_max - nudged_zero_point) * scale

    Args:
        min_broadcast (tvm.tensor.Tensor): minimum value to be quantified for each channel.
        max_broadcast (tvm.tensor.Tensor): maximum value to be quantified for each channel.
        num_bits (int): num_bits is the bitwidth of the quantization, range [2,16].
        narrow_range (bool): if True, quantize each channel into the range [1, 2^num_bits - 1]; otherwise
                      into the range [0, 2^num_bits - 1].

    Returns:
        nudged_min (tvm.tensor.Tensor): The same type and shape as min_broadcast.
        nudged_max (tvm.tensor.Tensor): The same type and shape as max_broadcast.
        scale (tvm.tensor.Tensor): The same type and shape as max_broadcast.
    """

    dtype = min_broadcast.dtype
    quant_min = 1 if narrow_range else 0
    quant_max = (2**num_bits) - 1

    # quant_min and quant_max are broadcast so the computation is done per channel.
    quant_min_float = topi.full(min_broadcast.shape, dtype,
                                tvm.const(quant_min, dtype))
    quant_max_float = topi.full(min_broadcast.shape, dtype,
                                tvm.const(quant_max, dtype))

    # calculate each channel's max-min difference.
    max_sub_min = topi.subtract(max_broadcast, min_broadcast)
    quant_max_sub_quant_min = topi.subtract(quant_max_float, quant_min_float)
    # compute scale = (max_broadcast - min_broadcast) / (quant_max - quant_min)
    # and min_div_scale = min_broadcast / scale
    if product_is_mini():
        scale = mul(max_sub_min,
                    reciprocal(quant_max_sub_quant_min),
                    target=utils.CCE)
        min_div_scale = Mul(min_broadcast, reciprocal(scale), target=utils.CCE)
    else:
        scale = Divide(max_sub_min, quant_max_sub_quant_min, target=utils.CCE)
        min_div_scale = Divide(min_broadcast, scale, target=utils.CCE)

    # zero_point_from_min = quant_min_float - min_broadcast / scale
    zero_point_from_min = topi.subtract(quant_min_float, min_div_scale)
    # if zero_point_from_min < quant_min_float, bool_less_quant_min_float = 1 else 0
    bool_less_quant_min_float = less_compare_float32(zero_point_from_min,
                                                     quant_min_float)
    # if quant_max_float < zero_point_from_min, bool_more_quant_max_float = 1 else 0
    bool_more_quant_max_float = less_compare_float32(quant_max_float,
                                                     zero_point_from_min)

    # according to above bool param to select effective value
    less_quant_min_float = topi.multiply(quant_min_float,
                                         bool_less_quant_min_float)
    more_quant_max_float = topi.multiply(quant_max_float,
                                         bool_more_quant_max_float)

    # mark the values that are neither less than quant_min_float nor greater than quant_max_float
    tensor_one = topi.full(min_broadcast.shape, dtype, dc.one_const(dtype))
    bool_not_less_quant_min_float = topi.subtract(tensor_one,
                                                  bool_less_quant_min_float)
    bool_not_more_quant_max_float = topi.subtract(tensor_one,
                                                  bool_more_quant_max_float)
    bool_between_min_max = topi.multiply(bool_not_less_quant_min_float,
                                         bool_not_more_quant_max_float)
    between_min_max_float = topi.multiply(zero_point_from_min,
                                          bool_between_min_max)
    # add 0.5 to the in-range values and floor them (round half up).
    between_min_max_add_half_one = topi.add(between_min_max_float,
                                            dc.half_const(dtype))
    between_min_max_round = akg.lang.ascend.floor(between_min_max_add_half_one)
    if product_is_mini():
        between_min_max_round = topi.cast(between_min_max_round, "float16")

    between_min_max_round = topi.cast(between_min_max_round, "float32")

    # calculate the maximum and minimum values of the quantization
    nudged_zero_point_tmp = topi.add(less_quant_min_float,
                                     more_quant_max_float)
    nudged_zero_point = topi.add(nudged_zero_point_tmp, between_min_max_round)

    nudged_min_tmp = topi.subtract(quant_min_float, nudged_zero_point)
    nudged_max_tmp = topi.subtract(quant_max_float, nudged_zero_point)
    nudged_min = topi.multiply(nudged_min_tmp, scale)
    nudged_max = topi.multiply(nudged_max_tmp, scale)
    res = [nudged_min, nudged_max, scale]

    return res
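
The nudging math in the Notes condenses to a few NumPy lines; nudged_min_max_ref is a hypothetical reference, not the kernel:

import numpy as np

def nudged_min_max_ref(min_b, max_b, num_bits=8, narrow_range=False):
    quant_min = 1.0 if narrow_range else 0.0
    quant_max = 2.0 ** num_bits - 1.0
    scale = (max_b - min_b) / (quant_max - quant_min)
    zero_point_from_min = quant_min - min_b / scale
    # clip the zero point into [quant_min, quant_max]; round half up inside
    nudged_zero_point = np.where(
        zero_point_from_min < quant_min, quant_min,
        np.where(zero_point_from_min > quant_max, quant_max,
                 np.floor(zero_point_from_min + 0.5)))
    nudged_min = (quant_min - nudged_zero_point) * scale
    nudged_max = (quant_max - nudged_zero_point) * scale
    return nudged_min, nudged_max, scale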
Example 16
def mul_ad(head, a, b):
    output = mul(a, b, target=utils.CCE)
    jacs_ = list(akg.differentiate(output, [a], head))
    return jacs_[0]
Example 17
def mul(x, y, target=utils.CUDA):
    """Mul"""
    return math.mul(x, y, target)
Example 18
def mul_sub_mutioutput(first_input, second_input, third_input, target="cce"):
    temp = mul(first_input, second_input, target=target)
    output = sub(temp, third_input, target=target)
    return [temp, output]