Example #1
def _bessel_i1e_compute(input_data):
    """bessel i1e compute"""

    shape = utils.get_shape(input_data)
    dtype = input_data.dtype

    # choose the working data type at the beginning
    if dtype == "float16":
        input_data = Cast(input_data, "float32", target=utils.CCE)

    abs_data = Abs(input_data, utils.CCE)
    # compute bessel_i1e for data in (-3.75, 3.75)
    before_res = _before_res_compute(abs_data)
    # compute bessel_i1e for data in other domain
    after_res = _after_res_compute(abs_data)

    # The vcmp_lt and vsel instructions don't support fp32 on mini devices.
    # This could be simplified by techniques such as "auto cast".
    if product_is_mini():
        res = akg.tvm.compute(
            shape, lambda *indice: akg.tvm.expr.Select(
                abs_data[indice].astype("float16") < akg.tvm.const(
                    CONST_LIMIT, "float16"), before_res[indice].astype(
                        "float16"), after_res[indice].astype("float16")))
        res = Cast(res, "float32", target=utils.CCE)
    else:
        res = akg.tvm.compute(
            shape,
            lambda *indice: akg.tvm.expr.Select(abs_data[
                indice] < CONST_LIMIT, before_res[indice], after_res[indice]))
    data_sign = Sign(input_data, target=utils.CCE)
    res = mul(res, data_sign, target=utils.CCE)
    if dtype == "float16":
        res = Cast(res, "float16", target=utils.CCE)
    return res
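
The sign handling above relies on I1e being an odd function: the approximation is computed from |x| and then multiplied by sign(x). A minimal NumPy/SciPy sketch checking that property (assumes scipy is available; this is not part of the akg code above):

import numpy as np
from scipy.special import i1e  # reference implementation of the exponentially scaled Bessel I1

# I1e is odd, so computing it from |x| and restoring the sign afterwards is valid.
x = np.linspace(-10.0, 10.0, 9)
print(np.allclose(i1e(x), np.sign(x) * i1e(np.abs(x))))  # True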
Example #2
def pad(data, paddings, padtype, target="cce"):
    """add paddings to the tensor
    :shape: The shape of the tensor; currently only two-dimensional tensors are supported.
    :paddings: The paddings, with shape [N, 2] where N is the rank of the tensor.
     For each dimension D of the input, paddings[D, 0] indicates how many values to add before
     the contents of the tensor in that dimension, and paddings[D, 1] indicates how many values to
     add after the contents of the tensor in that dimension.
    :dtype: The type of the input, float16 or float32.
    :padtype: One of "CONSTANT", "REFLECT", or "SYMMETRIC".
    """
    # check shape
    utils.check_shape(data.shape)
    # check types
    utils.ops_dtype_check(data.dtype, utils.DtypeForDavinci.ALL_TYPES)
    # check padding types
    ptype_checklist = ['constant']
    if padtype not in ptype_checklist:
        raise RuntimeError("pad_cce only supports %s while padtype is %s" % (",".join(ptype_checklist), padtype))

    dtype = data.dtype
    if dtype == 'int8' or dtype == 'uint8':
        data = Cast(data, "float16", target=target)

    rank = len(data.shape)
    pad_before = []
    pad_after = []
    for i in range(rank):
        pad_before.append(paddings[i][0])
        pad_after.append(paddings[i][1])
    B = tvm_pad(data, pad_before, pad_after=pad_after, name='B')

    if dtype == 'int8' or dtype == 'uint8':
        B = Cast(B, dtype, target=target)
    return B
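
As a usage illustration only (a NumPy stand-in, not the akg API above): the [N, 2] paddings layout described in the docstring maps directly onto numpy.pad's pad_width argument, which is handy for checking expected output shapes.

import numpy as np

data = np.arange(6, dtype=np.float16).reshape(2, 3)
paddings = [[1, 1], [0, 2]]  # one row before/after dim 0, two columns after dim 1
padded = np.pad(data, paddings, mode="constant")
print(padded.shape)  # (4, 5)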
Example #3
def scale(input_data, scale_data, target="cce"):
    """
    Computes scaled input_data, res = input_data * scale_data

    Args:
        input_data(akg.tvm.Tensor): Tensor of type float16, float32, int8, uint8, int32.
        scale_data(akg.tvm.Tensor): Tensor of the same type as input_data; if shape(scale_data) != shape(input_data),
                                    scale_data will be broadcast to shape(input_data).

    Returns:
        akg.tvm.Tensor of same type and shape as input_data
    """

    # check shape
    input_data_shape = [x.value for x in input_data.shape]
    scale_shape = [x.value for x in scale_data.shape]
    utils.check_shape(input_data_shape)
    utils.check_shape(scale_shape)

    # check type
    check_list = ["float16", "float32", "int8", "uint8", "int32"]
    dtype = input_data.dtype
    if dtype not in check_list:
        raise TypeError(
            "scale operator only supports %s while dtype is %s" %
            (",".join(check_list), dtype))
    if scale_data.dtype != dtype:
        raise TypeError(
            "type(input_data) is %s, type(scale_data) is %d, which is inconsistent"
            % (dtype, scale_data.dtype))

    orig_dtype = dtype
    if dtype == "int8" or dtype == "uint8":
        dtype = "float16"
    if dtype == "int32":
        dtype = "float32"
    if dtype != orig_dtype:
        input_data = Cast(input_data, dtype, target=utils.CCE)
        scale_data = Cast(scale_data, dtype, target=utils.CCE)

    if scale_shape != input_data_shape:
        scale_data = akg.topi.broadcast_to(scale_data, input_data_shape)

    res = akg.tvm.compute(
        input_data_shape,
        lambda *indice: input_data(*indice) * scale_data(*indice),
        name="res")

    if res.dtype != orig_dtype:
        res = Cast(res, orig_dtype, target=utils.CCE)

    return res
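
For reference, the broadcast-then-multiply behaviour documented above can be mimicked in NumPy (illustrative sketch only; shapes are arbitrary and this is not the akg API):

import numpy as np

input_data = np.random.rand(2, 3, 4).astype(np.float32)
scale_data = np.random.rand(1, 3, 1).astype(np.float32)  # broadcastable to input_data's shape
res = np.broadcast_to(scale_data, input_data.shape) * input_data
print(np.allclose(res, input_data * scale_data))  # True, plain broadcasting gives the same result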
Example #4
def scale_bias(input_data, scale_data, bias_data, target="cce"):
    """
    Adds bias_data on scaled input_data, res = input_data * scale_data + bias_data

    Args:
        input_data(akg.tvm.Tensor): Tensor of type float16, float32, int8, uint8, int32.
        scale_data(akg.tvm.Tensor): Tensor of the same type as input_data; if shape(scale_data) != shape(input_data),
                                    scale_data will be broadcast to shape(input_data).
        bias_data(akg.tvm.Tensor): Tensor of the same type as input_data; if shape(bias_data) != shape(input_data),
                                   bias_data will be broadcast to shape(input_data).

    Returns:
        akg.tvm.Tensor of same type and shape as input_data.
    """

    # check shape
    input_data_shape = [x.value for x in input_data.shape]
    bias_shape = [x.value for x in bias_data.shape]
    utils.check_shape(bias_shape)

    # check type
    if bias_data.dtype != input_data.dtype:
        raise RuntimeError(
            "type(input_data) is %s, type(bias_data) is %d, which is inconsistent"
            % (input_data.dtype, bias_data.dtype))

    scale_input_data = scale(input_data, scale_data)

    dtype = bias_data.dtype
    orig_dtype = dtype
    if dtype == "int8" or dtype == "uint8":
        dtype = "float16"
    if dtype == "int32":
        dtype = "float32"
    if dtype != orig_dtype:
        scale_input_data = Cast(scale_input_data, dtype, target=utils.CCE)
        bias_data = Cast(bias_data, dtype, target=utils.CCE)

    if bias_shape != input_data_shape:
        bias_data = akg.topi.broadcast_to(bias_data, input_data_shape)

    res = akg.tvm.compute(
        input_data_shape,
        lambda *indice: scale_input_data(*indice) + bias_data(*indice),
        name="res_bias")

    if res.dtype != orig_dtype:
        res = Cast(res, orig_dtype, target=utils.CCE)

    return res
Example #5
def _bessel_i0e_compute(input_data):
    """bessel i0e compute"""

    shape_input = input_data.shape
    dtype_input = input_data.dtype

    # choose the working data type at the beginning
    if dtype_input == "float16":
        input_data = Cast(input_data, "float32", target=utils.CCE)
    abs_data = Abs(input_data, target=utils.CCE)

    # compute bessel_i0e for data in (-3.75, 3.75)
    # t = |x| / 3.75
    # I0e = e^-|x| * (1 + 3.5156229t^2 + 3.0899424t^4 + 1.2067492t^6 + 0.2659732t^8
    #       + 0.0360768t^10 + 0.0045813t^12), |x| <= 3.75
    broad_const_limit = akg.lang.ascend.broadcast(
        akg.tvm.const(CONST_LIMIT, "float32"), shape_input)
    before_abs_data = minimum(abs_data, broad_const_limit)
    data = topi.multiply(before_abs_data, 1.0 / CONST_LIMIT)
    square_data = mul(data, data, target=utils.CCE)
    before_res = topi.multiply(square_data, ITR_BEFORE[LEN_BEFORE - 1])
    before_res = topi.add(before_res, ITR_BEFORE[LEN_BEFORE - 2])
    for iter_number in ITR_BEFORE[LEN_BEFORE - 3::-1]:
        before_res = mul(before_res, square_data, target=utils.CCE)
        before_res = topi.add(before_res, iter_number)
    exp_data = Exp(neg(before_abs_data, target=utils.CCE), target=utils.CCE)
    before_res = mul(before_res, exp_data, target=utils.CCE)

    # compute bessel_i0e for data in other domain
    # t = |x| / 3.75
    # I0e(x) = (1 / sqrt(|x|))*(0.39894228 + 0.01328592t^-1 + 0.00225319t^-2 + -0.00157565t^-3
    #           + 0.00916281t^-4 + -0.02057706t^-5 + 0.02635537t^-6 + -0.01647633t^-7
    #           + 0.00392377t^-8), |x| >= 3.75
    data = Divide(broad_const_limit, abs_data, target=utils.CCE)
    after_res = topi.multiply(data, ITR_AFTER[LEN_AFTER - 1])
    after_res = topi.add(after_res, ITR_AFTER[LEN_AFTER - 2])
    for iter_number in ITR_AFTER[LEN_AFTER - 3::-1]:
        after_res = mul(after_res, data, target=utils.CCE)
        after_res = topi.add(after_res, iter_number)
    rsqrt_data = rsqrt(abs_data, target=utils.CCE)
    after_res = mul(after_res, rsqrt_data, target=utils.CCE)
    after_res = minimum(before_res, after_res, target=utils.CCE)

    # cast back to the original data type at the end
    if dtype_input == "float16":
        after_res = Cast(after_res, "float16", target=utils.CCE)

    return after_res
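
The coefficients in the comments above are the classic Abramowitz & Stegun polynomial for I0. A small NumPy sketch of the |x| <= 3.75 branch, compared against scipy.special.i0e purely as a reference (assumes scipy is installed; not part of the akg code):

import numpy as np
from scipy.special import i0e  # reference

def i0e_series(x):
    """e^-|x| * (1 + 3.5156229 t^2 + ... + 0.0045813 t^12), t = |x| / 3.75, valid for |x| <= 3.75."""
    t2 = (np.abs(x) / 3.75) ** 2
    coeffs = (3.5156229, 3.0899424, 1.2067492, 0.2659732, 0.0360768, 0.0045813)
    poly = np.ones_like(x, dtype=np.float64)
    for i, c in enumerate(coeffs, start=1):
        poly = poly + c * t2 ** i
    return np.exp(-np.abs(x)) * poly

x = np.linspace(-3.5, 3.5, 15)
print(np.max(np.abs(i0e_series(x) - i0e(x))))  # on the order of 1e-7 or smaller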
Example #6
def truncate_div_compute(input_x1, input_x2):
    """compute for truncate_div"""
    int_list = ("int32", "int8", "uint8")

    if input_x1.dtype in int_list:
        data_zero = dc.zero_const("float32")
        data_x_broad = Cast(input_x1, "float32", target=utils.CCE)
        data_y_broad = Cast(input_x2, "float32", target=utils.CCE)
        res_div = topi.divide(data_x_broad, data_y_broad)
        res_min_int = ceil(topi.minimum(res_div, data_zero))
        res_max_int = floor(topi.maximum(res_div, data_zero))
        res_trunc = topi.add(res_min_int, res_max_int)
        res_trunc = Cast(res_trunc, "float32", target=utils.CCE)
    else:
        res_trunc = topi.divide(input_x1, input_x2)

    return Cast(res_trunc, input_x1.dtype, target=utils.CCE)
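
The ceil-of-the-negative-part plus floor-of-the-positive-part trick above implements truncation toward zero without a dedicated trunc instruction. A NumPy sketch of the same idea (illustration only, not the akg API):

import numpy as np

def trunc_div_reference(x1, x2):
    q = x1.astype(np.float32) / x2.astype(np.float32)
    # ceil of min(q, 0) handles negative quotients, floor of max(q, 0) handles positive ones
    return (np.ceil(np.minimum(q, 0.0)) + np.floor(np.maximum(q, 0.0))).astype(x1.dtype)

x1 = np.array([7, -7, 5], dtype=np.int32)
x2 = np.array([2, 2, -3], dtype=np.int32)
print(trunc_div_reference(x1, x2))         # [ 3 -3 -1]
print(np.trunc(x1 / x2).astype(np.int32))  # same values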
Example #7
def truncatemod_func(a, b):
    """function for truncatemod formula"""
    # For positive numbers, floor and trunc are equivalent
    return akg.topi.subtract(
        a,
        akg.topi.multiply(
            b,
            Cast(floor(Divide(a, b, utils.CCE)),
                 b.dtype,
                 target=utils.CCE)))
Example #8
def truncatemod(x, y, target=utils.CCE):
    """
    Computes remainder of division(x / y).

    Note:
        res = x - y*trunc(x/y)

    Args:
        x(tvm.tensor.Tensor): Input tensor. Only float16 is supported on mini devices, while
                           int32, int8, uint8, float16 and float32 are supported on cloud devices.
        y(tvm.tensor.Tensor): Tensor with same type as input tensor x.

    Returns:
        tvm.tensor.Tensor of same type as input tensors.
    """

    utils.check_shape(x)
    utils.check_shape(y)
    utils.elemwise_dtype_check(x.dtype, y.dtype)
    dtype = x.dtype
    support_dtype = [
        utils.DtypeForDavinci.ALL_FLOAT, utils.DtypeForDavinci.INT32,
        utils.DtypeForDavinci.INT8, utils.DtypeForDavinci.UINT8
    ]
    if product_is_mini():
        support_dtype = [utils.DtypeForDavinci.FLOAT16]

    utils.ops_dtype_check(dtype, support_dtype)

    if not product_is_mini():
        # High-precision computation is required.
        # For example, let x = 132.05 and y = 131.95: x and y are very close, but the difference
        # between trunc(x) = 132 and trunc(y) = 131 is 1.
        if dtype != "float32":
            x = Cast(x, "float32", target=target)
            y = Cast(y, "float32", target=target)
        res = akg.topi.mod(x, y)
    else:
        res = _truncatemod_compute_mini(x, y)

    if res.dtype != dtype:
        res = Cast(res, dtype, target=target)
    return res
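
A NumPy reference for the formula in the Note (res = x - y*trunc(x/y)), shown here only to contrast truncated modulo with NumPy's floor-based modulo; this is not the akg implementation:

import numpy as np

def truncatemod_reference(x, y):
    return x - y * np.trunc(x / y)

x = np.array([ 5.0, -5.0,  5.0, -5.0])
y = np.array([ 3.0,  3.0, -3.0, -3.0])
print(truncatemod_reference(x, y))  # [ 2. -2.  2. -2.]  sign follows x
print(np.mod(x, y))                 # [ 2.  1. -1. -2.]  floor modulo, sign follows y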
Example #9
def bitwise_not(data, target=utils.CCE):
    """
    Bitwise-not.

    Args:
        data (tvm.tensor.Tensor): Input data of type int8 or int32.

    Returns:
        tvm.tensor.Tensor, Bitwise-not result.
    """
    utils.ops_dtype_check(data.dtype, utils.DtypeForDavinci.ALL_INT)
    utils.check_shape(data.shape)

    # bitwise not via the two's complement identity: ~x == -(x + 1)
    one = akg.tvm.const(1, dtype=data.dtype)
    minus_one = akg.tvm.const(-1, dtype=data.dtype)
    add_one = akg.lang.ascend.vadds(data, one)
    multiply_one = akg.lang.ascend.vmuls(add_one, minus_one)
    res = Cast(multiply_one, data.dtype, target=target)
    return res
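
A quick NumPy check of the identity used above (illustration only):

import numpy as np

x = np.array([0, 1, -3, 100], dtype=np.int8)
print(np.bitwise_not(x))  # [  -1   -2    2 -101]
print(-(x + 1))           # same values: ~x == -(x + 1) in two's complement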
Example #10
def ones_like(input):
    """
    Generate an array of ones.

    Args:
        input (tvm.tensor.Tensor): Tensor, should be of type float16, float32, int32, uint8 or int8.

    Returns:
        tvm.tensor.Tensor with the same type and shape as input.
    """
    dtype = input.dtype
    shape = get_shape(input)
    utils.ops_dtype_check(dtype, [utils.DtypeForDavinci.ALL_TYPES])
    utils.check_shape(shape)
    res = akg.tvm.compute(shape,
                          lambda *i: akg.tvm.const(1, "float16"),
                          name="res",
                          attrs={'no_inline': 1})
    res = Cast(res, dtype, target=utils.CCE)
    return res
Example #11
def cast_conv(data,
              fmap_shape,
              filter_shape,
              pad_,
              stride_,
              dilation_,
              use_bias=False,
              block_size=16,
              attrs=None):
    a = data[0]
    data[1].dtype = 'float32'
    b = Cast(data[1], 'float16', target='cce')
    if use_bias:
        conv_data = [a, b, data[2]]
    else:
        conv_data = [a, b]
    # mmad fp32 failed in post_fusing
    res, _ = conv_core(conv_data, fmap_shape, filter_shape, pad_, stride_,
                       dilation_, use_bias, block_size, attrs)
    return res, {}
Example #12
def custom_reduce_min_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
    data = inputs[0]
    shape = get_shape(data)
    if len(get_shape(data)) == 2:
        # add an extra stage to avoid alignment problem
        min_input = akg.tvm.compute(data.shape,
                                    lambda *i: data(*i),
                                    name="min_input")
        min_ = akg.lang.ascend.reduce_min(min_input,
                                          axis=-1,
                                          keepdims=True)
        min_broadcast = akg.lang.ascend.broadcast(min_, shape)
        if dtype != "float16":
            data = Cast(data, "float16", target=utils.CCE)
        return [
            akg.tvm.compute(shape,
                            lambda i, j: akg.tvm.expr.Select(
                                data[i, j] == min_broadcast[i, j], grad[i],
                                akg.tvm.const(0, dtype="float16")),
                            name="reduce_min_ad2")
        ]
Example #13
def custom_reduce_max_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
    data_ = inputs[0]
    shape = data_.shape
    # reduces maximum value for each column
    max_ = akg.lang.ascend.reduce_max(data_, axis=axis, keepdims=True)
    # copies reduced values to get the original shape
    max_broadcast = akg.lang.ascend.broadcast(max_, shape)
    # head broadcast is needed to generate correct cce code for the selection operation
    head_broadcast = akg.tvm.compute(
        shape, lambda *indices: head_(*get_reduced_indices(
            *indices, axis=axis, keepdims=keepdims)))
    # zero all the values that are not max values on the result, remaining is equal to the adjoint of the output
    max_values_and_zeros = akg.tvm.compute(
        shape,
        lambda *indices: akg.tvm.expr.Select(
            data_(*indices) == max_broadcast(*indices),
            head_broadcast(*indices), akg.tvm.const(0, dtype='float16')),
        name="reduce_max_ad2")
    # cast data back to the original dtype
    if dtype != 'float16':
        return [Cast(max_values_and_zeros, dtype, target=utils.CCE)]
    else:
        return [max_values_and_zeros]
Example #14
def fused_minimum_or_maximum_grad(dz, x, y, grad_x, grad_y, op_type):
    """
    Gradient for minimum or maximum operation between two input tensors `x` and `y`.

    Args:
        dz (tvm.tensor.Tensor): Type float16, float32, int32.
        x (tvm.tensor.Tensor): Type float16, float32, int32.
        y (tvm.tensor.Tensor): Type float16, float32, int32.
        grad_x (bool): Whether calculate dx.
        grad_y (bool): Whether calculate dy.
        op_type (str): The type of the op, "GE" for MaximumGrad or "LE" for MinimumGrad.

    Note:
        At least one of grad_x and grad_y is True.

    Returns:
        dx, tvm.tensor.Tensor of the same type as inputs, it will be returned if grad_x is True.
        dy, tvm.tensor.Tensor of the same type as inputs, it will be returned if grad_y is True.
    """
    utils.check_shape(x)
    utils.check_shape(y)
    utils.check_shape(dz)
    utils.ops_dtype_check(
        [x.dtype, y.dtype, dz.dtype],
        [utils.DtypeForDavinci.ALL_FLOAT, utils.DtypeForDavinci.INT32])

    utils.broadcast_check(x, dz)
    utils.broadcast_check(y, dz)

    # check op types
    check_list = ["GE", "LE"]
    if op_type not in check_list:
        raise ValueError(
            "FusedMinimumOrMaximumGrad only support %s while op type is %s" %
            (",".join(check_list), op_type))

    if not grad_x and not grad_y:
        raise ValueError("At least one of grad_x and grad_y is True.")

    x_shape = get_shape(x)
    y_shape = get_shape(y)
    dz_shape = get_shape(dz)
    ori_dtype = dz.dtype

    # get greater compute
    x = akg.lang.ascend.broadcast(x, dz_shape)
    y = akg.lang.ascend.broadcast(y, dz_shape)

    if product_is_mini() and ori_dtype != "float16":
        x = Cast(x, "float16", "cce")
        y = Cast(y, "float16", "cce")
        dz = Cast(dz, "float16", "cce")
    elif ori_dtype == "int32":
        x = Cast(x, "float32", "cce")
        y = Cast(y, "float32", "cce")
        dz = Cast(dz, "float32", "cce")
    zero = zero_const(dz.dtype)

    if op_type == "LE":
        dx = tvm.compute(dz_shape,
                         lambda *i: tvm.expr.Select(
                             (x(*i) <= y(*i)), dz(*i), zero),
                         name='dx')
        dy = topi.subtract(dz, dx)
    elif op_type == "GE":
        dx = tvm.compute(dz_shape,
                         lambda *i: tvm.expr.Select(
                             (x(*i) >= y(*i)), dz(*i), zero),
                         name='dx')
        dy = topi.subtract(dz, dx)

    if dx.dtype == "float16":
        # cast to fp32 for higher precision of reduce_sum.
        if get_shape(dx) != x_shape:
            dx = Cast(dx, "float32", "cce")
        if get_shape(dy) != y_shape:
            dy = Cast(dy, "float32", "cce")

    dx = sum_by_shape(dx, x_shape)
    dy = sum_by_shape(dy, y_shape)

    if ori_dtype != dx.dtype:
        dx = Cast(dx, ori_dtype, "cce")
    if ori_dtype != dy.dtype:
        dy = Cast(dy, ori_dtype, "cce")

    attrs = get_default_attrs()
    if grad_x and grad_y:
        return dx, dy, attrs
    if grad_x:
        return dx, attrs
    return dy, attrs
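
The core of the gradient above is the Select/subtract pair: dz is routed to whichever input wins the comparison, and the other input gets the remainder. A NumPy sketch of the "LE" (MinimumGrad) case, ignoring broadcasting and the reduce-sum back to the original shapes (names are illustrative only):

import numpy as np

def minimum_grad_reference(dz, x, y):
    dx = np.where(x <= y, dz, 0.0)  # dz flows to x where x produced the minimum
    dy = dz - dx                    # the rest flows to y
    return dx, dy

x = np.array([1.0, 4.0, 2.0])
y = np.array([3.0, 2.0, 2.0])
dz = np.array([0.1, 0.2, 0.3])
print(minimum_grad_reference(dz, x, y))  # (array([0.1, 0. , 0.3]), array([0. , 0.2, 0. ]))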
Example #15
def cross(x, y, target=utils.CCE):
    """
    Compute cross product of x and y.

    Note:
        The first dim of x or y must be 3, it will be calculated as (two dims for example)
        .. math::
            res = x \\times y = \\left[ \\begin{matrix}
            l, & \\cdots \\\\ m, & \\cdots \\\\ n, & \\cdots
            \\end{matrix} \\right] \\times \\left[ \\begin{matrix}
            o, & \\cdots \\\\ p, & \\cdots \\\\ q, & \\cdots
            \\end{matrix} \\right] = \\left[ \\begin{matrix}
            mq-np, & \\cdots \\\\ no-lq, & \\cdots \\\\ lp-mo, & \\cdots \\\\
            \\end{matrix} \\right]

    Args:
        x (tvm.tensor.Tensor): Input tensor; supported types are float16, float32,
                               int32, int8 and uint8.
        y (tvm.tensor.Tensor): Input tensor, must have the same shape and dtype
                               as x.

    Returns:
        A tvm.tensor.Tensor with the same shape and dtype as x.
    """
    utils.elemwise_shape_check(get_shape(y), get_shape(x))
    utils.elemwise_dtype_check(
        y.dtype, x.dtype,
        (utils.DtypeForDavinci.ALL_FLOAT) if product_is_mini() \
            else (utils.DtypeForDavinci.FLOAT16,
                  utils.DtypeForDavinci.FLOAT32,
                  utils.DtypeForDavinci.INT32,
                  utils.DtypeForDavinci.INT8, utils.DtypeForDavinci.UINT8))

    shape = get_shape(x)

    if shape[0] != 3:
        raise RuntimeError(
            "The first axis of input must be 3, actual input is %d" % shape[0])

    inp_dtype = x.dtype
    need_type_convert = inp_dtype in ("int8", "uint8")

    shape = get_shape(x)
    shp = shape[1:]

    if need_type_convert:
        x = Cast(x, "float16", target=utils.CCE)
        y = Cast(y, "float16", target=utils.CCE)

    a0b1 = tvm.compute(shp, lambda *i: x(0, *i) * y(1, *i), name="a0b1")
    a0b2 = tvm.compute(shp, lambda *i: x(0, *i) * y(2, *i), name="a0b2")
    a1b0 = tvm.compute(shp, lambda *i: x(1, *i) * y(0, *i), name="a1b0")
    a1b2 = tvm.compute(shp, lambda *i: x(1, *i) * y(2, *i), name="a1b2")
    a2b0 = tvm.compute(shp, lambda *i: x(2, *i) * y(0, *i), name="a2b0")
    a2b1 = tvm.compute(shp, lambda *i: x(2, *i) * y(1, *i), name="a2b1")

    res0 = tvm.compute(shp, lambda *i: a1b2(*i) - a2b1(*i), name="res0")
    res1 = tvm.compute(shp, lambda *i: a2b0(*i) - a0b2(*i), name="res1")
    res2 = tvm.compute(shp, lambda *i: a0b1(*i) - a1b0(*i), name="res2")

    res = tvm.compute(
        shape,
        lambda *i: tvm.expr.Select(
            i[0] == 0, res0(*i[1:]),
            tvm.expr.Select(i[0] == 1, res1(*i[1:]), res2(*i[1:]))),
        name='res')

    if need_type_convert:
        res = Cast(res, inp_dtype, target=utils.CCE)

    return res
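
The six products and three subtractions above are the standard component formula with the three vector components laid out along the first axis. A NumPy check of that layout, using numpy.cross with axis=0 only as a reference (illustrative, not the akg API):

import numpy as np

x = np.random.rand(3, 4).astype(np.float32)
y = np.random.rand(3, 4).astype(np.float32)
manual = np.stack([x[1] * y[2] - x[2] * y[1],
                   x[2] * y[0] - x[0] * y[2],
                   x[0] * y[1] - x[1] * y[0]])
print(np.allclose(manual, np.cross(x, y, axis=0)))  # True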
Example #16
def reduce_min_ad_optimized_manual_schedule(input_shape,
                                            dtype,
                                            axis,
                                            keepdims,
                                            polyhedral=True,
                                            attrs=None):
    def get_shape(pld):
        return [d.value for d in pld.shape]

    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # only works for the last axis of 2-D inputs; needs to be extended to multiple dimensions and axes.
    def custom_reduce_min_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
        data = inputs[0]
        shape = get_shape(data)
        if len(get_shape(data)) == 2:
            # add an extra stage to avoid alignment problem
            min_input = akg.tvm.compute(data.shape,
                                        lambda *i: data(*i),
                                        name="min_input")
            min_ = akg.lang.ascend.reduce_min(min_input,
                                              axis=-1,
                                              keepdims=True)
            min_broadcast = akg.lang.ascend.broadcast(min_, shape)
            if dtype != "float16":
                data = Cast(data, "float16", target=utils.CCE)
            return [
                akg.tvm.compute(shape,
                                lambda i, j: akg.tvm.expr.Select(
                                    data[i, j] == min_broadcast[i, j], grad[i],
                                    akg.tvm.const(0, dtype="float16")),
                                name="reduce_min_ad2")
            ]

    l = reduce_min(data, axis, target=utils.CCE)
    head = akg.tvm.placeholder(l.shape, name="head", dtype=l.dtype)
    head_cast = Cast(head, "float16", target=utils.CCE)

    [dl_ddata
     ] = akg.differentiate(l, [data],
                           head_cast,
                           None,
                           None,
                           override={l: ([data], custom_reduce_min_fdiff)})

    s = akg.tvm.create_schedule([dl_ddata.op])

    head_ub = s.cache_read(head, "local.UB", [head_cast])
    if dtype == "float16":
        data_ub = s.cache_read(data, "local.UB", [dl_ddata])
    else:
        data_ub = s.cache_read(data, "local.UB",
                               [dl_ddata.op.input_tensors[0]])
        min_input_ub = s.cache_read(
            dl_ddata.op.input_tensors[1].op.input_tensors[0].op.
            input_tensors[0].op.input_tensors[0].op.input_tensors[0],
            "local.UB", [
                dl_ddata.op.input_tensors[1].op.input_tensors[0].op.
                input_tensors[0].op.input_tensors[0]
            ])
        s[dl_ddata.op.input_tensors[1].op.input_tensors[0].op.input_tensors[0].
          op.input_tensors[0]].set_scope("local.UB")

    dl_ddata_ub = s.cache_write(dl_ddata, "local.UB")

    # tiling
    split_axis = {}
    for i in range(len(attrs['tile'])):
        split_axis["axis" + str(i)] = s[dl_ddata].split(
            dl_ddata.op.axis[i], attrs["tile"][i])

    split_axis_sorted = sorted(split_axis.items())

    if dtype == "float16":
        s[data_ub].compute_at(s[dl_ddata], split_axis_sorted[-1][1][0])
    else:
        s[data_ub].compute_at(s[dl_ddata], split_axis_sorted[-1][1][0])
        s[dl_ddata.op.input_tensors[0]].compute_at(s[dl_ddata],
                                                   split_axis_sorted[-1][1][0])
        s[dl_ddata.op.input_tensors[0]].set_scope("local.UB")
        s[min_input_ub].compute_at(s[dl_ddata], split_axis_sorted[0][1][1])

    s[head_ub].compute_at(s[dl_ddata], split_axis_sorted[-1][1][0])
    s[head_cast].compute_at(s[dl_ddata], split_axis_sorted[-1][1][0])
    s[head_cast].set_scope("local.UB")
    s[dl_ddata.op.input_tensors[1]].compute_at(s[dl_ddata],
                                               split_axis_sorted[-1][1][0])
    s[dl_ddata.op.input_tensors[1]].set_scope("local.UB")
    s[dl_ddata.op.input_tensors[1].op.input_tensors[0]].compute_at(
        s[dl_ddata], split_axis_sorted[0][1][1])
    s[dl_ddata.op.input_tensors[1].op.input_tensors[0]].set_scope("local.UB")
    s[dl_ddata.op.input_tensors[1].op.input_tensors[0].op.
      input_tensors[0]].compute_at(s[dl_ddata], split_axis_sorted[0][1][1])
    s[dl_ddata.op.input_tensors[1].op.input_tensors[0].op.
      input_tensors[0]].set_scope("local.UB")

    # L is not being used for computation
    # s[L].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    # s[L].set_scope("local.UB")

    s[dl_ddata_ub].compute_at(s[dl_ddata], split_axis_sorted[-1][1][0])

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, head, dl_ddata],
                        "cce",
                        name="reduce_min_ad_manual_schedule",
                        attrs=attrs,
                        polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "reduce_min_ad_manual_schedule"
        create_code(kernel_name, './', source_code)
    return mod
Example #17
def smooth_l1_loss(prediction,
                   targets,
                   anchor_samples,
                   anchor_sample_correct=0,
                   delta=1.0):
    """
    Smooth l1 loss.

    For each value x in `error=predictions-target`, the following is calculated:

    .. math::
        y = \\left\\{
            \\begin{array}{rcl}
                0.5 x^2, & & if \\left| x \\right| <= d \\\\
                0.5 d^2 + d \\cdot (\\left| x \\right| - d), & & if \\left| x \\right| > d
            \\end{array}
        \\right.

    `anchor_samples` acts as a mask for the loss:
    where anchor_samples == anchor_sample_correct the loss is set to 0, otherwise the smooth-L1 loss is kept.

    Args:
        prediction (tvm.tensor.Tensor): A float tensor of shape
            [batch_size, num_anchors, code_size] representing the (encoded)
            predicted locations of objects.
        targets (tvm.tensor.Tensor): A float tensor of shape
            [batch_size, num_anchors, code_size]
            representing the regression targets
        anchor_samples (tvm.tensor.Tensor): An int tensor of shape [batch_size, num_anchors].
        anchor_sample_correct (int): The threshold of anchor_samples.
        delta (float): The point where the loss function changes from quadratic to linear.

    Returns:
        loss (tvm.tensor.Tensor): A float tensor of shape [batch_size, num_anchors] tensor
               representing the value of the loss function.
    """
    dim_info, _ = smooth_l1_loss_set_dim_func(prediction, targets,
                                              anchor_samples,
                                              anchor_sample_correct, delta)
    attrs = {DIM: dim_info}

    prediction_dtype = prediction.dtype
    target_dtype = targets.dtype
    anchor_samples_dtype = anchor_samples.dtype

    utils.elemwise_dtype_check(prediction_dtype, target_dtype,
                               utils.DtypeForDavinci.ALL_FLOAT)
    utils.ops_dtype_check(
        anchor_samples_dtype,
        [utils.DtypeForDavinci.INT8, utils.DtypeForDavinci.INT32])

    if anchor_sample_correct > 5 or anchor_sample_correct < 0:
        raise ValueError("anchor_sample_correct attr only support [0,5]")

    # check shape dim
    prediction_shape = get_shape(prediction)
    if len(prediction_shape) != 3:
        raise RuntimeError("Prediction shape only support 3-dim!")

    target_shape = get_shape(targets)
    if len(target_shape) != 3:
        raise RuntimeError("Target shape only support 3-dim!")

    anchor_samples_shape = get_shape(anchor_samples)
    if len(anchor_samples_shape) != 2:
        raise RuntimeError("weights shape only support 2-dim!")

    prediction_dtype_old = prediction_dtype

    if product_is_mini() and prediction_dtype == 'float32':
        prediction = akg.topi.cast(prediction, "float16")
        targets = akg.topi.cast(targets, "float16")
        prediction_dtype = "float16"

    # cast anchor_samples to float type in order to use the vcmp instruction
    if anchor_samples.dtype.lower() != prediction_dtype.lower():
        anchor_samples = Cast(anchor_samples,
                              prediction_dtype,
                              target=utils.CCE)
    anchor_samples_dtype = anchor_samples.dtype.lower()

    coefficient = akg.tvm.const(0.5, dtype=prediction_dtype)
    delta = akg.tvm.const(delta, dtype=prediction_dtype)

    error = akg.topi.subtract(prediction, targets)
    abs_error = akg.topi.abs(error)
    quadratic = akg.topi.minimum(abs_error, delta)
    linear = akg.topi.subtract(abs_error, quadratic)
    loss = akg.topi.add(
        akg.topi.multiply(coefficient, akg.topi.multiply(quadratic,
                                                         quadratic)),
        akg.topi.multiply(delta, linear))
    loss = akg.topi.sum(loss, axis=-1)
    loss = akg.tvm.compute(loss.shape,
                           lambda *i: akg.tvm.expr.Select(
                               anchor_samples(*i) == anchor_sample_correct,
                               akg.tvm.const(0, loss.dtype), loss(*i)),
                           name="loss")

    if product_is_mini() and prediction_dtype_old == 'float32':
        loss = akg.topi.cast(loss, prediction_dtype_old)

    return loss, attrs
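
A compact NumPy reference of the loss defined in the docstring (quadratic below delta, linear above, summed over the code_size axis and zeroed where the anchor matches anchor_sample_correct). Shapes and names are illustrative only; this is not the akg implementation:

import numpy as np

def smooth_l1_reference(pred, target, anchor_samples, anchor_sample_correct=0, delta=1.0):
    err = np.abs(pred - target)
    quadratic = np.minimum(err, delta)
    linear = err - quadratic
    loss = (0.5 * quadratic ** 2 + delta * linear).sum(axis=-1)
    return np.where(anchor_samples == anchor_sample_correct, 0.0, loss)

pred = np.random.rand(2, 5, 4).astype(np.float32)
tgt = np.random.rand(2, 5, 4).astype(np.float32)
anchors = np.random.randint(0, 2, size=(2, 5))
print(smooth_l1_reference(pred, tgt, anchors).shape)  # (2, 5)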
Example #18
def reduce_max_ad_optimized_manual_schedule(input_shape,
                                            dtype,
                                            axis,
                                            keepdims,
                                            polyhedral=True,
                                            attrs=None):
    def custom_reduce_max_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        data_ = inputs[0]
        shape = data_.shape
        # reduces maximum value for each column
        max_ = akg.lang.ascend.reduce_max(data_, axis=axis, keepdims=True)
        # copies reduced values to get the original shape
        max_broadcast = akg.lang.ascend.broadcast(max_, shape)
        # head broadcast is needed to generate correct cce code for the selection operation
        head_broadcast = akg.tvm.compute(
            shape, lambda *indices: head_(*get_reduced_indices(
                *indices, axis=axis, keepdims=keepdims)))
        # zero all the values that are not max values on the result, remaining is equal to the adjoint of the output
        max_values_and_zeros = akg.tvm.compute(
            shape,
            lambda *indices: akg.tvm.expr.Select(
                data_(*indices) == max_broadcast(*indices),
                head_broadcast(*indices), akg.tvm.const(0, dtype='float16')),
            name="reduce_max_ad2")
        # cast data back to the original dtype
        if dtype != 'float16':
            return [Cast(max_values_and_zeros, dtype, target=utils.CCE)]
        else:
            return [max_values_and_zeros]

    # tensor for the input data
    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # computation of reduce max
    # not used in the schedule because this is the differentiation op
    l = reduce_max(data, axis, keepdims, target=utils.CCE)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(l.shape, name="head", dtype=l.dtype)

    # cast input data
    if dtype != 'float16':
        data_cast = Cast(data, "float16", target=utils.CCE)
        head_cast = Cast(head, "float16", target=utils.CCE)
    else:
        data_cast = data
        head_cast = head

    # override differentiation computation with custom function
    [dl_ddata] = akg.differentiate(
        l, [data_cast],
        head_cast,
        None,
        None,
        override={l: ([data_cast], custom_reduce_max_fdiff)})

    # get tensors from custom function
    if dtype != 'float16':
        max_values_and_zeros = dl_ddata.op.input_tensors[0]
        max_broadcast = max_values_and_zeros.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = max_values_and_zeros.op.input_tensors[2]
    else:
        max_broadcast = dl_ddata.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = dl_ddata.op.input_tensors[2]

    # schedule for the differentiation operation
    # inputs: data and head
    s = akg.tvm.create_schedule([dl_ddata.op])

    # cache reads of inputs
    if dtype != 'float16':
        head_ub = s.cache_read(head, "local.UB", [head_cast])
        data_ub = s.cache_read(data, "local.UB", [data_cast])
    else:
        # no cast operation
        head_ub = s.cache_read(head_cast, "local.UB", [head_broadcast])
        data_ub = s.cache_read(data_cast, "local.UB", [max_, dl_ddata])

    # cache write for the output
    dl_ddata_ub = s.cache_write(dl_ddata, "local.UB")

    # get tiling attributes
    if attrs is None:
        raise Exception('attrs is None')
    tiling_factors = attrs['tile']
    split_iterators = []
    assert len(tiling_factors) == len(dl_ddata.shape)
    # split the final compute and save the iterators
    for index, factor in enumerate(tiling_factors):
        split_iterators.append(s[dl_ddata].split(dl_ddata.op.axis[index],
                                                 factor))

    # get iterators
    iterator1 = split_iterators[0][0]

    # move the cast computations into the tiled loop when a cast is present
    if dtype != "float16":
        s[data_cast].compute_at(s[dl_ddata], iterator1)
        s[data_cast].set_scope("local.UB")
        s[head_cast].compute_at(s[dl_ddata], iterator1)
        s[head_cast].set_scope("local.UB")
        s[max_values_and_zeros].compute_at(s[dl_ddata], iterator1)
        s[max_values_and_zeros].set_scope("local.UB")

    # move cache reads and writes
    s[data_ub].compute_at(s[dl_ddata], iterator1)
    s[head_ub].compute_at(s[dl_ddata], iterator1)
    s[dl_ddata_ub].compute_at(s[dl_ddata], iterator1)

    # move computation of the differentiation
    s[max_].compute_at(s[dl_ddata], iterator1)
    s[max_].set_scope("local.UB")
    s[max_broadcast].compute_at(s[dl_ddata], iterator1)
    s[max_broadcast].set_scope("local.UB")
    s[head_broadcast].compute_at(s[dl_ddata], iterator1)
    s[head_broadcast].set_scope("local.UB")

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, dl_ddata],
                        "cce",
                        name="reduce_max_ad_manual_schedule",
                        attrs=attrs,
                        polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "reduce_max_ad_manual_schedule"
        create_code(kernel_name, './', source_code)
    return mod