Code Example #1
def _bessel_i1e_compute(input_data):
    """bessel i1e compute"""

    shape = vc_util.get_shape(input_data)
    dtype = input_data.dtype

    # choose the compute dtype at the beginning
    if dtype == "float16":
        input_data = cast(input_data, "float32")

    abs_data = abs_value(input_data)
    # compute bessel_i1e for data in (-3.75, 3.75)
    before_res = _before_res_compute(abs_data)
    # compute bessel_i1e for data in other domain
    after_res = _after_res_compute(abs_data)

    # The vcmp_lt and vsel instructions do not support fp32 on mini devices.
    # This could be simplified by methods such as "auto cast".
    if utils.product_is_mini():
        res = akg.tvm.compute(
            shape, lambda *indice: akg.tvm.expr.Select(
                abs_data[indice].astype("float16") < akg.tvm.const(
                    CONST_LIMIT, "float16"), before_res[indice].astype(
                        "float16"), after_res[indice].astype("float16")))
        res = cast(res, "float32")
    else:
        res = akg.tvm.compute(
            shape,
            lambda *indice: akg.tvm.expr.Select(abs_data[
                indice] < CONST_LIMIT, before_res[indice], after_res[indice]))
    data_sign = sign(input_data)
    res = mul(res, data_sign)
    if dtype == "float16":
        res = cast(res, "float16")
    return res
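
The kernel above evaluates both series branches over the whole input and then picks one per element. A minimal NumPy sketch of that piecewise-selection pattern (illustrative only, not akg code; CONST_LIMIT is taken as 3.75 from the comment above, and before_fn/after_fn are placeholder callables):

import numpy as np

CONST_LIMIT = 3.75

def piecewise_select(x, before_fn, after_fn):
    abs_x = np.abs(x)
    before = before_fn(abs_x)   # branch used where |x| < CONST_LIMIT
    after = after_fn(abs_x)     # branch used elsewhere
    return np.sign(x) * np.where(abs_x < CONST_LIMIT, before, after)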
Code Example #2
File: pad.py Project: zhuyawen/akg
def pad(data, paddings, padtype):
    """add paddings to the tensor
    :shape: The shape of the tensor, now only support two dimension Tensor
    :paddings: The shape of the paddings, shape [N,2], N is the dimension of the tensor,
     For each dimension D of input, paddings[D, 0] indicates how many values to add before
     the contents of tensor in that dimension, and paddings[D, 1] indicates how many values to
     add after the contents of tensor in that dimension.
    :dtype: The type of the input, float16, float32
    :padtype: One of "CONSTANT", "REFLECT", or "SYMMETRIC".
    """
    # check shape
    vc_util.check_shape(data.shape)
    # check types
    vc_util.ops_dtype_check(data.dtype, vc_util.DtypeForDavinci.ALL_TYPES)
    # check padding types
    ptype_checklist = ['constant']
    if padtype not in ptype_checklist:
        raise RuntimeError("pad_cce only supports %s while padtype is %s" % (",".join(ptype_checklist), padtype))

    dtype = data.dtype
    if dtype == 'int8' or dtype == 'uint8':
        data = cast(data, "float16")

    rank = len(data.shape)
    pad_before = []
    pad_after = []
    for i in range(rank):
        pad_before.append(paddings[i][0])
        pad_after.append(paddings[i][1])
    B = tvm_pad(data, pad_before, pad_after=pad_after, name='B')

    if dtype == 'int8' or dtype == 'uint8':
        B = cast(B, dtype)
    return B
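
The paddings layout is the per-dimension [before, after] pair also used by numpy.pad, so the expected output shape can be sanity-checked with a small NumPy sketch (illustrative only, not akg code):

import numpy as np

data = np.ones((2, 3), dtype=np.float16)
paddings = [[1, 2], [0, 3]]            # dim 0: 1 before, 2 after; dim 1: 0 before, 3 after
padded = np.pad(data, paddings, mode="constant")
print(padded.shape)                    # (2+1+2, 3+0+3) -> (5, 6)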
Code Example #3
File: broadcast_to.py Project: zhuyawen/akg
def broadcast_to(x, shape):
    """
    Broadcast a tensor to a compatible shape.

    Args:
        x (tvm.tensor.Tensor): Tensor of type float32, float16, int8, uint8, int32
        shape (list, tuple): The shape of output tensor.

    Returns:
        A tvm.tensor.Tensor with the same type as x.

    """
    # check shape
    vc_util.check_shape(x)
    vc_util.check_shape(shape)

    # check dtype
    dtype = x.dtype
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_TYPES)

    # The vector_dup instruction does not support int8 and uint8.
    # This could be simplified by methods such as "auto cast".
    x_shape = get_shape(x)
    if len(x_shape) == 1 and x_shape[0] == 1 and dtype in ["int8", "uint8"]:
        x = cast(x, "float16")

    res = topi.broadcast_to(x, shape)
    if res.dtype != dtype:
        res = cast(res, dtype)
    return res
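
For intuition, the shape semantics mirror NumPy broadcasting; a tiny illustrative check (not akg code):

import numpy as np

x = np.zeros((1, 3), dtype=np.float16)
y = np.broadcast_to(x, (4, 3))   # the single row is repeated to fill the new shape
print(y.shape)                   # (4, 3)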
Code Example #4
File: minimum.py Project: zhuyawen/akg
def minimum(input1, input2):
    """
    Return the min value of two tensors element-wise.

    Note:
        minimum supports broadcasting.

    Args:
        input1: Tensor.
        input2: Tensor. Has the same type as input1.

    Returns:
        Tensor, has the same type as inputs.
    """

    vc_util.ops_dtype_check([input1.dtype, input2.dtype], vc_util.DtypeForDavinci.ALL_TYPES)
    vc_util.elemwise_dtype_check(input1.dtype, input2.dtype)
    dtype = input1.dtype

    shape1 = [x.value for x in input1.shape]
    shape2 = [x.value for x in input2.shape]
    vc_util.check_shape(shape1)
    vc_util.check_shape(shape2)

    vc_util.auto_broadcast_check(shape1, shape2)

    if dtype in ("int8", "uint8"):
        input1 = cast(input1, "float16")
        input2 = cast(input2, "float16")
    res = akg.topi.minimum(input1, input2)
    if dtype in ("int8", "uint8"):
        res = cast(res, dtype)

    return res
Code Example #5
def div(data1, data2):
    """
    Calculates x/y, and returns an integer when the inputs are all integers.

    When both arguments are integers, integer division (also known as "floor division") is used.
    When the arguments are floating-point numbers, normal floating-point division is used.

    Note:
        div supports broadcasting.

    Args:
        data1 (tvm.tensor.Tensor): Tensor of type float16, float32, int32, int8 and uint8.
        data2 (tvm.tensor.Tensor): Tensor of type float16, float32, int32, int8 and uint8.

    Returns:
        tvm.tensor.Tensor, has the same type as data1 and data2.
    """

    vc_util.ops_dtype_check([data1.dtype, data2.dtype],
                            vc_util.DtypeForDavinci.ALL_TYPES)
    vc_util.elemwise_dtype_check(data1.dtype, data2.dtype)
    dtype = data1.dtype

    shape1 = [x.value for x in data1.shape]
    shape2 = [x.value for x in data2.shape]
    vc_util.check_shape(shape1)
    vc_util.check_shape(shape2)

    vc_util.auto_broadcast_check(shape1, shape2)
    n_shape1, n_shape2, out_shape = produce_shapes(shape1, shape2)
    if n_shape1 != out_shape:
        input1_cast = akg.topi.broadcast_to(data1, out_shape)
    else:
        input1_cast = data1
    if n_shape2 != out_shape:
        input2_cast = akg.topi.broadcast_to(data2, out_shape)
    else:
        input2_cast = data2

    if dtype in ("int32", "int8", "uint8"):
        input1p = cast(input1_cast, "float16")
        input2p = cast(input2_cast, "float16")
    else:
        input1p = input1_cast
        input2p = input2_cast

    if utils.product_is_mini():
        input2p_rec = reciprocal(input2p)
        res = akg.topi.multiply(input1p, input2p_rec)
    else:
        res = akg.topi.divide(input1p, input2p)

    if dtype in ("int8", "uint8"):
        res = floor(res)
        res = cast(res, "float16")
    if dtype in ("int32", "int8", "uint8"):
        res = cast(res, dtype)

    return res
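
A plain-Python illustration of the documented semantics (not akg code): integer inputs use floor division, float inputs use true division.

print(7 // 2, (-7) // 2)      # 3 -4  (floor division rounds toward negative infinity)
print(7 / 2.0, (-7) / 2.0)    # 3.5 -3.5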
Code Example #6
File: approximate_equal.py Project: zhuyawen/akg
def approximate_equal(x, y, tolerance=1e-5):
    """
    Check whether abs(x - y) is less than or equal to the tolerance.

    Args:
        x (tvm.tensor.Tensor): Tensor of type float16, float32.
        y (tvm.tensor.Tensor): Tensor of type float16, float32.
        tolerance (float): default is 1e-5

    Returns:
        tvm.tensor.Tensor. If abs(x - y) is less than or equal to the tolerance, return True;
        otherwise return False.
    """

    if tolerance < 0:
        raise RuntimeError("tolerance should >= 0")

    # check shape
    vc_util.check_shape(x)
    vc_util.check_shape(y)
    shape = get_shape(x)
    if shape != get_shape(y):
        raise RuntimeError("input shape must be same, but got %s vs %s", shape,
                           get_shape(y))

    # check input tensor data_type
    vc_util.ops_dtype_check(x.dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    vc_util.ops_dtype_check(y.dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    dtype = x.dtype
    if dtype != y.dtype:
        raise RuntimeError("input type must be same, but got %s  vs %s", dtype,
                           y.dtype)

    res_vsub = sub(x, y)
    res_vabs = abs_value(res_vsub)

    # The vcmp_lt and vsel instructions do not support fp32 on mini devices.
    # This could be simplified by methods such as "auto cast".
    if utils.product_is_mini():
        dtype = "float16"
        res_vabs = cast(res_vabs, dtype)

    t = akg.tvm.compute(shape, lambda *indice: akg.tvm.const(1, dtype), "t")
    f = akg.tvm.compute(shape, lambda *indice: akg.tvm.const(0, dtype), "f")
    res = akg.tvm.compute(
        shape, lambda *indice: akg.tvm.expr.Select(
            res_vabs[indice] <= akg.tvm.const(tolerance, dtype), t[indice], f[
                indice]))

    # This could be simplified if the cast op supported casting fp16/fp32 directly to bool.
    res_fp16 = cast(res, "float16")
    res_bool = akg.tvm.compute(
        shape, lambda *indice: res_fp16(*indice).astype("bool"))
    return res_bool
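
The result is the element-wise test abs(x - y) <= tolerance; an equivalent NumPy expression for intuition (illustrative only, not akg code):

import numpy as np

x = np.array([1.0, 2.0], dtype=np.float32)
y = np.array([1.000001, 2.5], dtype=np.float32)
print(np.abs(x - y) <= 1e-5)   # [ True False]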
Code Example #7
def reduce_min_max(data, axis=None, keepdims=False, method="min"):
    """
    Computes the maximum or minimum of elements over a given axis or a list of axes of a tensor.

    Args:
        data (tvm.tensor.Tensor): The input tensor to reduce. Should be of type float16, float32, int8, uint8, int32.
        axis (Union[list, tuple, int, None]): The dimensions to reduce.
                                      If None, all dimensions will be reduced.
                                      If int or list, must be in the range [-len(data.shape), len(data.shape) - 1].
        keepdims (bool): If True, retains reduced dimensions with length 1, default value is False.
        method (str): Specifies to compute maximum or minimum of input tensor, default value is min.

    Returns:
        tvm.tensor.Tensor of same type as input tensor data.
    """
    # check shape
    vc_util.check_shape(data.shape)

    # check type
    dtype = data.dtype
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_TYPES)

    # check axis
    shape_len = len(data.shape)
    if axis is None:
        axis = range(shape_len)
    if hasattr(axis, 'index'):
        axis = list(axis)
    if isinstance(axis, int):
        axis = [axis]
    vc_util.is_valid_reduce_axis(data, axis)
    refined_axis = refine_reduce_axis(data, axis)
    if len(set(refined_axis)) == len(data.shape) and not keepdims:
        raise ValueError("When reducing on all axes of input, keepdim should be set to True.")
    # check method
    method_list = ["min", "max"]
    if method not in method_list:
        raise ValueError("supported method %s while given method is %s" % (",".join(method_list), method))

    # In the emit_insn pass, for vmin and vmax, reduce_last_axis only supports float16.
    if dtype != "float16":
        data = cast(data, "float16")

    if method == "min":
        res = akg.topi.min(data, axis=axis, keepdims=keepdims)
    else:
        res = akg.topi.max(data, axis=axis, keepdims=keepdims)

    if res.dtype != dtype:
        res = cast(res, dtype)

    return res
Code Example #8
def equal_count(x, y):
    """
    Compute the number of equal elements of x and y.

    Args:
        x (tvm.tensor.Tensor): Tensor of type int32.
        y (tvm.tensor.Tensor): Tensor of type int32.

    Returns:
        tvm.tensor.Tensor, equal num, type is int32.
    """
    # check shapes
    shape1 = get_shape(x)
    shape2 = get_shape(y)
    shapes = [shape1, shape2]
    for shape_ in shapes:
        vc_util.check_shape(shape_)
    if len(shape1) != 1 or len(shape2) != 1:
        raise RuntimeError("Two inputs should all be one dim!")

    # check types
    dtype = x.dtype
    vc_util.ops_dtype_check([x.dtype, y.dtype], vc_util.DtypeForDavinci.INT32)

    # Due to instruction limitations, the int32 data needs to be converted to
    # float16 or float32.
    # When int32 data is cast to float16, there may be overflow problems,
    # so the int32 data is cast to float32 whenever possible.
    orig_dtype = dtype
    if product_is_mini():
        dtype = "float16"
    else:
        dtype = "float32"
    x = cast(x, dtype)
    y = cast(y, dtype)

    shape1, shape2, shape = produce_shapes(shape1, shape2)
    t = akg.tvm.compute(shape, lambda *indice: akg.tvm.const(1, dtype), "t")
    f = akg.tvm.compute(shape, lambda *indice: akg.tvm.const(0, dtype), "f")
    x = akg.topi.broadcast_to(x, shape)
    y = akg.topi.broadcast_to(y, shape)
    z = akg.tvm.compute(shape,
                        lambda *indice: akg.tvm.expr.Select(
                            x[indice] == y[indice], t[indice], f[indice]),
                        name="z")
    res, _ = sum_value(z)
    if res.dtype != orig_dtype:
        res = cast(res, orig_dtype)
    return res
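
Functionally this counts matching positions of two 1-D int32 tensors; a NumPy equivalent for reference (illustrative only, not akg code):

import numpy as np

x = np.array([1, 2, 3, 4], dtype=np.int32)
y = np.array([1, 0, 3, 0], dtype=np.int32)
print(int(np.sum(x == y)))   # 2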
Code Example #9
def truncate_div_compute(input_x1, input_x2):
    """compute for truncate_div"""
    int_list = ("int32", "int8", "uint8")

    if input_x1.dtype in int_list:
        data_zero = dc.zero_const("float32")
        data_x_broad = cast(input_x1, "float32")
        data_y_broad = cast(input_x2, "float32")
        res_div = topi.divide(data_x_broad, data_y_broad)
        res_min_int = ceil(topi.minimum(res_div, data_zero))
        res_max_int = floor(topi.maximum(res_div, data_zero))
        res_trunc = topi.add(res_min_int, res_max_int)
        res_trunc = cast(res_trunc, "float32")
    else:
        res_trunc = topi.divide(input_x1, input_x2)

    return cast(res_trunc, input_x1.dtype)
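
The ceil(minimum(q, 0)) + floor(maximum(q, 0)) combination above implements rounding toward zero (truncation): one of the two terms is always zero and the other rounds toward zero. A small plain-Python check (illustrative only):

import math

for q in (2.7, -2.7, 0.3, -0.3):
    trunc = math.ceil(min(q, 0.0)) + math.floor(max(q, 0.0))
    print(q, trunc, math.trunc(q))   # the last two columns agree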
Code Example #10
def truncatemod(x, y):
    """
    Computes remainder of division(x / y).

    Note:
        res = x - y*trunc(x/y)

    Args:
        x(tvm.tensor.Tensor): Input tensor; supports float16 on mini devices, and
                           int32, int8, uint8, float16, float32 on cloud devices.
        y(tvm.tensor.Tensor): Tensor with same type as input tensor x.

    Returns:
        tvm.tensor.Tensor of same type as input tensors.
    """

    vc_util.check_shape(x)
    vc_util.check_shape(y)
    vc_util.elemwise_dtype_check(x.dtype, y.dtype)
    dtype = x.dtype
    support_dtype = [
        vc_util.DtypeForDavinci.ALL_FLOAT, vc_util.DtypeForDavinci.INT32,
        vc_util.DtypeForDavinci.INT8, vc_util.DtypeForDavinci.UINT8
    ]
    if utils.product_is_mini():
        support_dtype = [vc_util.DtypeForDavinci.FLOAT16]

    vc_util.ops_dtype_check(dtype, support_dtype)

    if not utils.product_is_mini():
        # High-precision computation is required.
        # For example, let x = 132.05, y = 131.95; x and y are very close, but the difference
        # between trunc(x) = 132 and trunc(y) = 131 is 1.
        if dtype != "float32":
            x = cast(x, "float32")
            y = cast(y, "float32")
        res = akg.topi.mod(x, y)
    else:
        res = _truncatemod_compute_mini(x, y)

    if res.dtype != dtype:
        res = cast(res, dtype)
    return res
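
A worked example of the note above (res = x - y*trunc(x/y)), illustrative only: truncated mod takes the sign of x, unlike Python's % operator, which uses floored division and takes the sign of y.

import math

def truncmod(x, y):
    return x - y * math.trunc(x / y)

print(truncmod(7, -3), 7 % -3)    # 1 -2
print(truncmod(-7, 3), -7 % 3)    # -1 2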
Code Example #11
File: equal.py Project: zhuyawen/akg
def equal(input1, input2):
    """
    Check whether input1 equals input2.

    Args:
        input1 (tvm.tensor.Tensor): input argument has type float16, float32 and int32.
        input2 (tvm.tensor.Tensor): input argument has type float16, float32 and int32.

    Returns:
        tvm.tensor.Tensor. If input1 equals input2, return True; otherwise return False.
    """
    # check shapes
    shape1 = [x.value for x in input1.shape]
    shape2 = [x.value for x in input2.shape]
    shapes = [shape1, shape2]
    for shp in shapes:
        vc_util.check_shape(shp)

    vc_util.ops_dtype_check([input1.dtype, input2.dtype], [
        vc_util.DtypeForDavinci.ALL_FLOAT, vc_util.DtypeForDavinci.INT32,
        vc_util.DtypeForDavinci.INT8, vc_util.DtypeForDavinci.UINT8
    ])

    dtype = input1.dtype
    orig_dtype = dtype
    if utils.product_is_mini() and dtype != "float16":
        dtype = "float16"
    if (not utils.product_is_mini()) and dtype not in ("float16", "float32"):
        # for int32, if cast to float16, there may be overflow
        dtype = "float32"

    if orig_dtype == "float32" and dtype == "float16":
        input_sub = sub(input1, input2)
        input_sub = cast(input_sub, dtype)
        zero = akg.tvm.const(0.0, dtype)
        res = akg.topi.equal(input_sub, zero)
    else:
        input1 = cast(input1, dtype)
        input2 = cast(input2, dtype)
        res = akg.topi.equal(input1, input2)
    return res
Code Example #12
File: sum.py Project: zhuyawen/akg
def sum_v2(inputs, axis=None, keepdims=True):
    """another implementation of sum with topi api."""
    dtype = inputs.dtype
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    axis = ft_util.refine_reduce_axis(inputs, axis)
    vc_util.check_shape(inputs.shape)
    if not axis:
        output = akg.topi.identity(inputs)
    else:
        if dtype == "float16":
            step_sum = cast(inputs, "float32")
        else:
            step_sum = inputs

        step_sum = akg.topi.sum(step_sum, axis=axis, keepdims=keepdims)

        if dtype == "float16":
            output = cast(step_sum, "float16")
        else:
            output = step_sum
    attr_map = get_attrs()
    return output, attr_map
Code Example #13
File: pow.py Project: zhuyawen/akg
def pow_value(data, scale):
    shape1 = [x.value for x in data.shape]
    shape2 = [x.value for x in scale.shape]

    check_list = ["float16", "float32", "int32", "int8", "uint8"]
    dtype = data.dtype
    if dtype.lower() not in check_list:
        raise RuntimeError("tile_cce only supports %s while dtype is %s" %
                           (",".join(check_list), dtype))

    shape = [x.value for x in data.shape]
    vc_util.check_shape(shape)
    vc_util.auto_broadcast_check(shape1, shape2)
    compute_dtype = "float32"
    if utils.product_is_mini():
        compute_dtype = "float16"
    data = cast(data, compute_dtype)
    scale = cast(scale, compute_dtype)

    C = akg.topi.power(data, scale)
    C = cast(C, dtype)
    return C
Code Example #14
def bitwise_not(data):
    """
    Bitwise-not.

    Args:
        data (tvm.tensor.Tensor): Input data of type int8 or int32.

    Returns:
        tvm.tensor.Tensor, Bitwise-not result.
    """
    vc_util.ops_dtype_check(data.dtype, vc_util.DtypeForDavinci.ALL_INT)
    vc_util.check_shape(data.shape)

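    # Note: for two's-complement integers, ~x == -(x + 1), so bitwise-not is
    # computed below as (x + 1) * (-1) using vadds and vmuls.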
    one = akg.tvm.const(1, dtype=data.dtype)
    minus_one = akg.tvm.const(-1, dtype=data.dtype)
    add_one = akg.lang.cce.vadds(data, one)
    multiply_one = akg.lang.cce.vmuls(add_one, minus_one)
    res = cast(multiply_one, data.dtype)
    return res
Code Example #15
File: reduce_min_ad.py Project: zhuyawen/akg
 def custom_reduce_min_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
     data = inputs[0]
     shape = get_shape(data)
     if len(get_shape(data)) == 2:
         # add an extra stage to avoid alignment problem
         min_input = akg.tvm.compute(data.shape,
                                     lambda *i: data(*i),
                                     name="min_input")
         min_ = akg.lang.cce.reduce_min(min_input, axis=-1, keepdims=True)
         min_broadcast = akg.lang.cce.broadcast(min_, shape)
         if dtype != "float16":
             data = cast(data, "float16")
         return [
             akg.tvm.compute(shape,
                             lambda i, j: akg.tvm.expr.Select(
                                 data[i, j] == min_broadcast[i, j], grad[i],
                                 akg.tvm.const(0, dtype="float16")),
                             name="reduce_min_ad2")
         ]
Code Example #16
File: ones_like.py Project: zhuyawen/akg
def ones_like(input):
    """
    Generate an array of ones.

    Args:
        input (tvm.tensor.Tensor): Tensor, should be of type float16, float32, int32, uint8, int8.

    Returns:
        tvm.tensor.Tensor with the same type and shape as input.
    """
    dtype = input.dtype
    shape = get_shape(input)
    vc_util.ops_dtype_check(dtype, [vc_util.DtypeForDavinci.ALL_TYPES])
    vc_util.check_shape(shape)
    res = akg.tvm.compute(shape,
                          lambda *i: akg.tvm.const(1, "float16"),
                          name="res",
                          attrs={'no_inline': 1})
    res = cast(res, dtype)
    return res
Code Example #17
File: reduce_max_ad.py Project: RyanWhb/akg
 def custom_reduce_max_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
     data_ = inputs[0]
     shape = data_.shape
     # reduces maximum value for each column
     max_ = akg.lang.cce.reduce_max(data_, axis=axis, keepdims=True)
     # copies reduced values to get the original shape
     max_broadcast = akg.lang.cce.broadcast(max_, shape)
     # head broadcast is needed to generate correct cce code for the selection operation
     head_broadcast = akg.tvm.compute(
         shape, lambda *indices: head_(*get_reduced_indices(
             *indices, axis=axis, keepdims=keepdims)))
     # zero all the values that are not max values on the result, remaining is equal to the adjoint of the output
     max_values_and_zeros = akg.tvm.compute(
         shape,
         lambda *indices: akg.tvm.expr.Select(
             data_(*indices) == max_broadcast(*indices),
             head_broadcast(*indices), akg.tvm.const(0, dtype='float16')),
         name="reduce_max_ad2")
     # cast data back to the original dtype
     if dtype != 'float16':
         return [cast(max_values_and_zeros, dtype)]
     else:
         return [max_values_and_zeros]
Code Example #18
File: reduce_min_ad.py Project: zhuyawen/akg
def reduce_min_ad_optimized_manual_schedule(input_shape,
                                            dtype,
                                            axis,
                                            keepdims,
                                            polyhedral=True,
                                            attrs=None):
    def get_shape(pld):
        return [d.value for d in pld.shape]

    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # Only works for the last axis and 2-D input; needs to be extended to multiple dimensions and axes.
    def custom_reduce_min_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
        data = inputs[0]
        shape = get_shape(data)
        if len(get_shape(data)) == 2:
            # add an extra stage to avoid alignment problem
            min_input = akg.tvm.compute(data.shape,
                                        lambda *i: data(*i),
                                        name="min_input")
            min_ = akg.lang.cce.reduce_min(min_input, axis=-1, keepdims=True)
            min_broadcast = akg.lang.cce.broadcast(min_, shape)
            if dtype != "float16":
                data = cast(data, "float16")
            return [
                akg.tvm.compute(shape,
                                lambda i, j: akg.tvm.expr.Select(
                                    data[i, j] == min_broadcast[i, j], grad[i],
                                    akg.tvm.const(0, dtype="float16")),
                                name="reduce_min_ad2")
            ]

    L = reduce_min.reduce_min(data, axis)
    head = akg.tvm.placeholder(L.shape, name="head", dtype=L.dtype)
    head_cast = cast(head, "float16")

    [dL_ddata
     ] = akg.differentiate(L, [data],
                           head_cast,
                           None,
                           None,
                           override={L: ([data], custom_reduce_min_fdiff)})

    s = akg.tvm.create_schedule([dL_ddata.op])

    head_ub = s.cache_read(head, "local.UB", [head_cast])
    if dtype == "float16":
        data_ub = s.cache_read(data, "local.UB", [dL_ddata])
    else:
        data_ub = s.cache_read(data, "local.UB",
                               [dL_ddata.op.input_tensors[0]])
        min_input_ub = s.cache_read(
            dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
            input_tensors[0].op.input_tensors[0].op.input_tensors[0],
            "local.UB", [
                dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
                input_tensors[0].op.input_tensors[0]
            ])
        s[dL_ddata.op.input_tensors[1].op.input_tensors[0].op.input_tensors[0].
          op.input_tensors[0]].set_scope("local.UB")

    dL_ddata_ub = s.cache_write(dL_ddata, "local.UB")

    # tiling
    split_axis = {}
    for i in range(len(attrs['tile'])):
        split_axis["axis" + str(i)] = s[dL_ddata].split(
            dL_ddata.op.axis[i], attrs["tile"][i])

    split_axis_sorted = sorted(split_axis.items())

    if dtype == "float16":
        s[data_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    else:
        s[data_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
        s[dL_ddata.op.input_tensors[0]].compute_at(s[dL_ddata],
                                                   split_axis_sorted[-1][1][0])
        s[dL_ddata.op.input_tensors[0]].set_scope("local.UB")
        s[min_input_ub].compute_at(s[dL_ddata], split_axis_sorted[0][1][1])

    s[head_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    s[head_cast].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    s[head_cast].set_scope("local.UB")
    s[dL_ddata.op.input_tensors[1]].compute_at(s[dL_ddata],
                                               split_axis_sorted[-1][1][0])
    s[dL_ddata.op.input_tensors[1]].set_scope("local.UB")
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0]].compute_at(
        s[dL_ddata], split_axis_sorted[0][1][1])
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0]].set_scope("local.UB")
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
      input_tensors[0]].compute_at(s[dL_ddata], split_axis_sorted[0][1][1])
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
      input_tensors[0]].set_scope("local.UB")

    # L is not being used for computation
    # s[L].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    # s[L].set_scope("local.UB")

    s[dL_ddata_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, head, dL_ddata],
                        "cce",
                        name="reduce_min_ad_manual_schedule",
                        attrs=attrs,
                        polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "reduce_min_ad_manual_schedule"
        utils.create_code(kernel_name, './', source_code)
    return mod
Code Example #19
File: reduce_max_ad.py Project: RyanWhb/akg
def reduce_max_ad_optimized_manual_schedule(input_shape,
                                            dtype,
                                            axis,
                                            keepdims,
                                            polyhedral=True,
                                            attrs=None):
    def custom_reduce_max_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        data_ = inputs[0]
        shape = data_.shape
        # reduces maximum value for each column
        max_ = akg.lang.cce.reduce_max(data_, axis=axis, keepdims=True)
        # copies reduced values to get the original shape
        max_broadcast = akg.lang.cce.broadcast(max_, shape)
        # head broadcast is needed to generate correct cce code for the selection operation
        head_broadcast = akg.tvm.compute(
            shape, lambda *indices: head_(*get_reduced_indices(
                *indices, axis=axis, keepdims=keepdims)))
        # zero all the values that are not max values on the result, remaining is equal to the adjoint of the output
        max_values_and_zeros = akg.tvm.compute(
            shape,
            lambda *indices: akg.tvm.expr.Select(
                data_(*indices) == max_broadcast(*indices),
                head_broadcast(*indices), akg.tvm.const(0, dtype='float16')),
            name="reduce_max_ad2")
        # cast data back to the original dtype
        if dtype != 'float16':
            return [cast(max_values_and_zeros, dtype)]
        else:
            return [max_values_and_zeros]

    # tensor for the input data
    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # computation of reduce max
    # not used in the schedule because this is the differentiation op
    l = reduce_max.reduce_max(data, axis, keepdims)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(l.shape, name="head", dtype=l.dtype)

    # cast input data
    if dtype != 'float16':
        data_cast = cast(data, "float16")
        head_cast = cast(head, "float16")
    else:
        data_cast = data
        head_cast = head

    # override differentiation computation with custom function
    [dl_ddata] = akg.differentiate(
        l, [data_cast],
        head_cast,
        None,
        None,
        override={l: ([data_cast], custom_reduce_max_fdiff)})

    # get tensors from custom function
    if dtype != 'float16':
        max_values_and_zeros = dl_ddata.op.input_tensors[0]
        max_broadcast = max_values_and_zeros.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = max_values_and_zeros.op.input_tensors[2]
    else:
        max_broadcast = dl_ddata.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = dl_ddata.op.input_tensors[2]

    # schedule for the differentiation operation
    # inputs: data and head
    s = akg.tvm.create_schedule([dl_ddata.op])

    # cache reads of inputs
    if dtype != 'float16':
        head_ub = s.cache_read(head, "local.UB", [head_cast])
        data_ub = s.cache_read(data, "local.UB", [data_cast])
    else:
        # no cast operation
        head_ub = s.cache_read(head_cast, "local.UB", [head_broadcast])
        data_ub = s.cache_read(data_cast, "local.UB", [max_, dl_ddata])

    # cache write for the output
    dl_ddata_ub = s.cache_write(dl_ddata, "local.UB")

    # get tiling attributes
    if attrs is None:
        raise Exception('attrs is None')
    tiling_factors = attrs['tile']
    split_iterators = []
    assert len(tiling_factors) == len(dl_ddata.shape)
    # split the final compute and save the iterators
    for index, factor in enumerate(tiling_factors):
        split_iterators.append(s[dl_ddata].split(dl_ddata.op.axis[index],
                                                 factor))

    # get iterators
    iterator1 = split_iterators[0][0]

    # move cast computations when there is a cast
    if dtype != "float16":
        s[data_cast].compute_at(s[dl_ddata], iterator1)
        s[data_cast].set_scope("local.UB")
        s[head_cast].compute_at(s[dl_ddata], iterator1)
        s[head_cast].set_scope("local.UB")
        s[max_values_and_zeros].compute_at(s[dl_ddata], iterator1)
        s[max_values_and_zeros].set_scope("local.UB")

    # move cache reads and writes
    s[data_ub].compute_at(s[dl_ddata], iterator1)
    s[head_ub].compute_at(s[dl_ddata], iterator1)
    s[dl_ddata_ub].compute_at(s[dl_ddata], iterator1)

    # move computation of the differentiation
    s[max_].compute_at(s[dl_ddata], iterator1)
    s[max_].set_scope("local.UB")
    s[max_broadcast].compute_at(s[dl_ddata], iterator1)
    s[max_broadcast].set_scope("local.UB")
    s[head_broadcast].compute_at(s[dl_ddata], iterator1)
    s[head_broadcast].set_scope("local.UB")

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, dl_ddata],
                        "cce",
                        name="reduce_max_ad_manual_schedule",
                        attrs=attrs,
                        polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "reduce_max_ad_manual_schedule"
        utils.create_cce(kernel_name, './', source_code)
    return mod
Code Example #20
def fused_minimum_or_maximum_grad(dz, x, y, grad_x, grad_y, op_type):
    """
    Gradient for minimum or maximum operation between two input tensors `x` and `y`.

    Args:
        dz (tvm.tensor.Tensor): Type float16, float32, int32.
        x (tvm.tensor.Tensor): Type float16, float32, int32.
        y (tvm.tensor.Tensor): Type float16, float32, int32.
        grad_x (bool): Whether calculate dx.
        grad_y (bool): Whether calculate dy.
        op_type (str): The type of the op, "GE" for MaximumGrad or "LE" for MinimumGrad.

    Note:
        At least one of grad_x and grad_y is True.

    Returns:
        dx, tvm.tensor.Tensor of the same type as the inputs; returned if grad_x is True.
        dy, tvm.tensor.Tensor of the same type as the inputs; returned if grad_y is True.
    """
    vc_util.check_shape(x)
    vc_util.check_shape(y)
    vc_util.check_shape(dz)
    vc_util.ops_dtype_check([x.dtype, y.dtype, dz.dtype],
                            [vc_util.DtypeForDavinci.ALL_FLOAT, vc_util.DtypeForDavinci.INT32])

    vc_util.broadcast_check(x, dz)
    vc_util.broadcast_check(y, dz)

    # check op types
    check_list = ["GE", "LE"]
    if op_type not in check_list:
        raise ValueError("FusedMinimumOrMaximumGrad only support %s while op type is %s" %
                         (",".join(check_list), op_type))

    if not grad_x and not grad_y:
        raise ValueError("At least one of grad_x and grad_y is True.")

    x_shape = get_shape(x)
    y_shape = get_shape(y)
    dz_shape = get_shape(dz)
    ori_dtype = dz.dtype

    # get greater compute
    x = akg.lang.cce.broadcast(x, dz_shape)
    y = akg.lang.cce.broadcast(y, dz_shape)

    if utils.product_is_mini() and ori_dtype != "float16":
        x = cast(x, "float16")
        y = cast(y, "float16")
        dz = cast(dz, "float16")
    elif ori_dtype == "int32":
        x = cast(x, "float32")
        y = cast(y, "float32")
        dz = cast(dz, "float32")
    zero = zero_const(dz.dtype)

    if op_type == "LE":
        dx = tvm.compute(dz_shape, lambda *i: tvm.expr.Select((x(*i) <= y(*i)), dz(*i), zero), name='dx')
        dy = topi.subtract(dz, dx)
    elif op_type == "GE":
        dx = tvm.compute(dz_shape, lambda *i: tvm.expr.Select((x(*i) >= y(*i)), dz(*i), zero), name='dx')
        dy = topi.subtract(dz, dx)

    if dx.dtype == "float16":
        # cast to fp32 for higher precision of reduce_sum.
        if get_shape(dx) != x_shape:
            dx = cast(dx, "float32")
        if get_shape(dy) != y_shape:
            dy = cast(dy, "float32")

    dx = sum.sum_by_shape(dx, x_shape)
    dy = sum.sum_by_shape(dy, y_shape)

    if ori_dtype != dx.dtype:
        dx = cast(dx, ori_dtype)
    if ori_dtype != dy.dtype:
        dy = cast(dy, ori_dtype)

    attrs = get_default_attrs()
    if grad_x and grad_y:
        return dx, dy, attrs
    if grad_x:
        return dx, attrs
    return dy, attrs
Code Example #21
def Cast(x, dst_type):
    """cast."""
    return cast.cast(x, dst_type)
Code Example #22
def matmul4D_compute(x,
                     y,
                     bias_value,
                     out_dtype,
                     left_format,
                     right_format,
                     out_format,
                     transpose_x=False,
                     transpose_y=False,
                     attrs=None):
    # for gemv use transpose of AB --> gevm trans(trans(B) * trans(A))

    data_dtype = x.dtype.lower()
    check_list = ["int8", "uint8", "float16", "float32", "int32"]
    if data_dtype not in check_list:
        raise RuntimeError("matmul_cce only supports %s while dtype is %s" %
                           (",".join(check_list), x.dtype))

    if bias_value is None:
        bias_name = ''
        bias = 0
    else:
        bias_name = bias_value.name
        bias = 0 if bias_value is None else 1

    output_shape_zN, k = output_shape_compute(x.shape, y.shape, left_format,
                                              right_format, "zN", transpose_x,
                                              transpose_y)
    output_shape_zZ, k = output_shape_compute(x.shape, y.shape, left_format,
                                              right_format, "zZ", transpose_x,
                                              transpose_y)

    shape_A = x.shape
    shape_B = y.shape
    key = ()

    key += (tuple(shape_A), tuple(shape_B), bias, left_format, right_format,
            out_format, transpose_x, transpose_y, x.dtype)
    hash_key = str(key)
    # bypass 2: left matrix DDR -> L0
    # bypass 1: right matrix DDR -> L0
    bypass_list = [0, 1, 2]
    bypass = 0
    if attrs is not None and 'bypass' in attrs:
        bypass = attrs['bypass']
    elif hash_key in matmul_set_dim_map:
        configs = matmul_set_dim_map[hash_key]
        if isinstance(configs, tuple):
            if len(configs) > 1 and "bypass" in configs[1]:
                bypass = configs[1]["bypass"]

    if bypass not in bypass_list:
        raise RuntimeError("matmul_cce only supports %s while bypass is %d" %
                           (",".join(str(b) for b in bypass_list), bypass))

    def matmul_compute(output_shape, adj_x, adj_y, left_format, right_format,
                       output_format, x, y, k, *indices):
        N = len(output_shape)
        # reduce axis
        ko = akg.tvm.reduce_axis((0, k // cce.BLOCK_REDUCE), name='ko')
        ki = akg.tvm.reduce_axis((0, cce.BLOCK_REDUCE), name='ki')
        if output_format == "zN":
            if left_format == "zZ":
                x_indices = indices[:(N - 4)] + indices[(N - 3):(N - 2)] + (
                    ko, ) + indices[(N - 2):(N - 1)] + (ki, )
                if adj_x:
                    x_indices = indices[:(N - 4)] + (ko, ) + indices[
                        (N - 3):(N - 2)] + (ki, ) + indices[(N - 2):(N - 1)]
            elif left_format == "zN":
                x_indices = indices[:(N - 4)] + (ko, ) + indices[
                    (N - 3):(N - 2)] + indices[(N - 2):(N - 1)] + (ki, )
                if adj_x:
                    x_indices = indices[:(N - 4)] + indices[(N - 3):(
                        N - 2)] + (ko, ) + (ki, ) + indices[(N - 2):(N - 1)]

            if right_format == "nZ":
                y_indices = indices[:(N - 4)] + (ko, ) + indices[
                    (N - 4):(N - 3)] + indices[(N - 1):] + (ki, )
                if adj_y:
                    y_indices = indices[:(N - 4)] + indices[
                        (N - 4):(N - 3)] + (ko, ki) + indices[(N - 1):]
            elif right_format == "zZ":
                y_indices = indices[:(N - 4)] + (ko, ) + indices[
                    (N - 4):(N - 3)] + (ki, ) + indices[(N - 1):]
                if adj_y:
                    y_indices = indices[:(N - 4)] + indices[
                        (N - 4):(N - 3)] + (ko, ) + indices[(N - 1):] + (ki, )
            elif right_format == "zN":
                y_indices = indices[:(N - 4)] + indices[
                    (N - 4):(N - 3)] + (ko, ) + (ki, ) + indices[(N - 1):]
                if adj_y:
                    y_indices = indices[:(N - 4)] + (ko, ) + indices[
                        (N - 4):(N - 3)] + indices[(N - 1):] + (ki, )

        return akg.lang.cce.mmad(
            (x(*x_indices) * y(*y_indices)).astype("float32"), axis=[ko, ki])

    if left_format == "zZ":
        data_trans = "N"
        data_trans_block = "N"
        data_trans_block_in = "N"
        if transpose_x:
            data_trans = "Y"
    elif left_format == "zN":
        data_trans = "Y"
        data_trans_block = "Y"
        data_trans_block_in = "N"
        if transpose_x:
            data_trans = "Y"
            data_trans_block = "N"
            data_trans_block_in = "Y"

    if right_format == "nZ":
        weight_trans = "N"
        weight_trans_block = "N"
        weight_trans_block_in = "N"
        if transpose_y:
            weight_trans = "Y"
    elif right_format == "zZ":
        if not transpose_y:
            weight_trans_block_in = "Y"
            weight_trans_block = "N"
            weight_trans = "Y"
        elif transpose_y:
            weight_trans = "Y"
            weight_trans_block = "Y"
            weight_trans_block_in = "N"
    elif right_format == "zN":
        weight_trans = "Y"
        weight_trans_block = "N"
        weight_trans_block_in = "N"
        if transpose_y:
            weight_trans = "N"
            weight_trans_block = "N"
            weight_trans_block_in = "N"

    result_matmul = akg.tvm.compute(
        output_shape_zN,
        lambda *indices: matmul_compute(output_shape_zN, transpose_x,
                                        transpose_y, left_format, right_format,
                                        "zN", x, y, k, *indices),
        name="resMatmul",
        attrs={
            "pragma_gemm_data": x.name,
            "pragma_data_transpose": data_trans,
            "pragma_data_transpose_block": data_trans_block,
            "pragma_data_transpose_block_inner": data_trans_block_in,
            "pragma_gemm_weight": y.name,
            "pragma_weight_transpose": weight_trans,
            "pragma_weight_transpose_block": weight_trans_block,
            "pragma_weight_transpose_block_inner": weight_trans_block_in,
            "pragma_conv_bypass_l1": bypass,
            "bias": bias_name,
        })

    if out_dtype == "float16" and (bias_value == None
                                   or bias_value.dtype == "float16"):
        result_matmul = cast.cast(result_matmul, out_dtype)

    def matmul_reshape(shape, result_matmul, *indices):
        N = len(shape)
        new_indices = indices[:(N - 4)] + indices[(N - 3):(N - 2)] + indices[
            (N - 4):(N - 3)] + indices[(N - 2):]
        return result_matmul(*new_indices)

    if out_format == "zZ":
        result = akg.tvm.compute(output_shape_zZ,
                                 lambda *indices: matmul_reshape(
                                     output_shape_zZ, result_matmul, *indices),
                                 name="result")
    else:
        result = result_matmul

    def bias_compute(output_shape, result, bias, output_format, *indices):
        N = len(output_shape)
        # reduce axis
        if output_format == "zN":
            bias_indices = indices[N - 4] * cce.BLOCK_OUT + indices[N - 1]
        elif output_format == "zZ":
            bias_indices = indices[N - 3] * cce.BLOCK_OUT + indices[N - 1]
        return result(*indices) + bias(bias_indices)

    if bias == 1:
        if out_format == "zN":
            out = akg.tvm.compute(
                output_shape_zN,
                lambda *indices: bias_compute(
                    output_shape_zN, result, bias_value, out_format, *indices),
                name="output")
        elif out_format == "zZ":
            out = akg.tvm.compute(
                output_shape_zZ,
                lambda *indices: bias_compute(
                    output_shape_zZ, result, bias_value, out_format, *indices),
                name="output")
        if out_dtype == "float16" and bias_value.dtype == "float32":
            out = cast.cast(out, out_dtype)
    else:
        out = result

    return out
Code Example #23
File: cross.py Project: zhuyawen/akg
def cross(x, y):
    """
    Compute cross product of x and y.

    Note:
        The first dim of x (and y) must be 3; it will be calculated as follows (two dims for example)
        .. math::
            res = x \\times y = \\left[ \\begin{matrix}
            l, & \\cdots \\\\ m, & \\cdots \\\\ n, & \\cdots
            \\end{matrix} \\right] \\times \\left[ \\begin{matrix}
            o, & \\cdots \\\\ p, & \\cdots \\\\ q, & \\cdots
            \\end{matrix} \\right] = \\left[ \\begin{matrix}
            mq-np, & \\cdots \\\\ no-lq, & \\cdots \\\\ lp-mo, & \\cdots \\\\
            \\end{matrix} \\right]

    Args:
        x (tvm.tensor.Tensor): Input tensor, only support float16, float32,
                               int32, int8, uint8.
        y (tvm.tensor.Tensor): Input tensor, must have the same shape and dtype
                               as x.

    Returns:
        A tvm.tensor.Tensor with the same shape and dtype as x.
    """
    vc_util.elemwise_shape_check(get_shape(y), get_shape(x))
    vc_util.elemwise_dtype_check(
        y.dtype, x.dtype,
        (vc_util.DtypeForDavinci.ALL_FLOAT) if utils.product_is_mini() \
            else (vc_util.DtypeForDavinci.FLOAT16,
                  vc_util.DtypeForDavinci.FLOAT32,
                  vc_util.DtypeForDavinci.INT32,
                  vc_util.DtypeForDavinci.INT8, vc_util.DtypeForDavinci.UINT8))

    shape = get_shape(x)

    if shape[0] != 3:
        raise RuntimeError(
            "The first axis of input must be 3, actual input is %d" % shape[0])

    inp_dtype = x.dtype
    need_type_convert = inp_dtype in ("int8", "uint8")

    shape = get_shape(x)
    shp = shape[1:]

    if need_type_convert:
        x = cast(x, "float16")
        y = cast(y, "float16")

    a0b1 = tvm.compute(shp, lambda *i: x(0, *i) * y(1, *i), name="a0b1")
    a0b2 = tvm.compute(shp, lambda *i: x(0, *i) * y(2, *i), name="a0b2")
    a1b0 = tvm.compute(shp, lambda *i: x(1, *i) * y(0, *i), name="a1b0")
    a1b2 = tvm.compute(shp, lambda *i: x(1, *i) * y(2, *i), name="a1b2")
    a2b0 = tvm.compute(shp, lambda *i: x(2, *i) * y(0, *i), name="a2b0")
    a2b1 = tvm.compute(shp, lambda *i: x(2, *i) * y(1, *i), name="a2b1")

    res0 = tvm.compute(shp, lambda *i: a1b2(*i) - a2b1(*i), name="res0")
    res1 = tvm.compute(shp, lambda *i: a2b0(*i) - a0b2(*i), name="res1")
    res2 = tvm.compute(shp, lambda *i: a0b1(*i) - a1b0(*i), name="res2")

    res = tvm.compute(
        shape,
        lambda *i: tvm.expr.Select(
            i[0] == 0, res0(*i[1:]),
            tvm.expr.Select(i[0] == 1, res1(*i[1:]), res2(*i[1:]))),
        name='res')

    if need_type_convert:
        res = cast(res, inp_dtype)

    return res
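
The component formula in the docstring is the ordinary 3-D cross product with the components stored along the first axis; a NumPy cross-check (illustrative only, not akg code; note numpy.cross expects the components on the last axis):

import numpy as np

x = np.array([[1.0], [2.0], [3.0]], dtype=np.float32)   # 3 components along axis 0
y = np.array([[4.0], [5.0], [6.0]], dtype=np.float32)
print(np.cross(x[:, 0], y[:, 0]))                       # [-3.  6. -3.]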
Code Example #24
 def truncatemod_func(a, b):
     """function for truncatemod formula"""
     # For positive numbers, floor and trunc are equivalent
     return akg.topi.subtract(
         a, akg.topi.multiply(b, cast(floor(div(a, b)), b.dtype)))
Code Example #25
File: smooth_l1_loss.py Project: zhuyawen/akg
def smooth_l1_loss(prediction, target, anchor_samples,
                   anchor_sample_correct=0, delta=1.0):
    """
    Smooth l1 loss.

    For each value x in `error=predictions-target`, the following is calculated:

    .. math::
        y = \\left\\{
            \\begin{array}{rcl}
                0.5 x^2, & & if \\left| x \\right| <= d \\\\
                0.5 d^2 + d \\cdot (\\left| x \\right| - d), & & if \\left| x \\right| > d
            \\end{array}
        \\right.

    `anchor_samples` acts as a mask for the loss:
    if anchor_samples == anchor_sample_correct, the loss is set to 0; otherwise the computed loss is kept.

    Args:
        prediction (tvm.tensor.Tensor): A float tensor of shape
            [batch_size, num_anchors, code_size] representing the (encoded)
            predicted locations of objects.
        target (tvm.tensor.Tensor): A float tensor of shape
            [batch_size, num_anchors, code_size]
            representing the regression targets
        anchor_samples (tvm.tensor.Tensor): An int tensor of shape [batch_size, num_anchors].
        anchor_sample_correct (int): The threshold of anchor_samples.
        delta (float): The point where the loss function changes from quadratic to linear.

    Returns:
        loss (tvm.tensor.Tensor): A float tensor of shape [batch_size, num_anchors] tensor
               representing the value of the loss function.
    """
    dim_info, _ = smooth_l1_loss_set_dim_func(
        prediction, target, anchor_samples, anchor_sample_correct, delta)
    attrs = {DIM: dim_info}

    prediction_dtype = prediction.dtype
    target_dtype = target.dtype
    anchor_samples_dtype = anchor_samples.dtype

    vc_util.elemwise_dtype_check(prediction_dtype, target_dtype,
                                 vc_util.DtypeForDavinci.ALL_FLOAT)
    vc_util.ops_dtype_check(anchor_samples_dtype,
                            [vc_util.DtypeForDavinci.INT8,
                             vc_util.DtypeForDavinci.INT32])

    if anchor_sample_correct > 5 or anchor_sample_correct < 0:
        raise ValueError("anchor_sample_correct attr only support [0,5]")

    # check shape dim
    prediction_shape = get_shape(prediction)
    if len(prediction_shape) != 3:
        raise RuntimeError("Prediction shape only support 3-dim!")

    target_shape = get_shape(target)
    if len(target_shape) != 3:
        raise RuntimeError("Target shape only support 3-dim!")

    anchor_samples_shape = get_shape(anchor_samples)
    if len(anchor_samples_shape) != 2:
        raise RuntimeError("weights shape only support 2-dim!")

    prediction_dtype_old = prediction_dtype

    if utils.product_is_mini() and prediction_dtype == 'float32':
        prediction = akg.topi.cast(prediction, "float16")
        target = akg.topi.cast(target, "float16")
        prediction_dtype = "float16"

    # cast anchor_samples to float type in order to use the vcmp instruction
    if anchor_samples.dtype.lower() != prediction_dtype.lower():
        anchor_samples = cast(anchor_samples, prediction_dtype)
    anchor_samples_dtype = anchor_samples.dtype.lower()

    coefficient = akg.tvm.const(0.5, dtype=prediction_dtype)
    delta = akg.tvm.const(delta, dtype=prediction_dtype)

    error = akg.topi.subtract(prediction, target)
    abs_error = akg.topi.abs(error)
    quadratic = akg.topi.minimum(abs_error, delta)
    linear = akg.topi.subtract(abs_error, quadratic)
    loss = akg.topi.add(akg.topi.multiply(coefficient, akg.topi.multiply(
        quadratic, quadratic)), akg.topi.multiply(delta, linear))
    loss = akg.topi.sum(loss, axis=-1)
    loss = akg.tvm.compute(loss.shape,
                           lambda *i:
                           akg.tvm.expr.Select(
                               anchor_samples(*i) == anchor_sample_correct,
                               akg.tvm.const(0, loss.dtype),
                               loss(*i)),
                           name="loss")

    if utils.product_is_mini() and prediction_dtype_old == 'float32':
        loss = akg.topi.cast(loss, prediction_dtype_old)

    return loss, attrs
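
The quadratic/linear split above is the standard smooth-L1 (Huber-style) term; an illustrative NumPy version of just that piecewise term, with the anchor_samples masking omitted for brevity (not akg code):

import numpy as np

def smooth_l1(error, delta=1.0):
    abs_err = np.abs(error)
    quadratic = np.minimum(abs_err, delta)   # |x| capped at delta
    linear = abs_err - quadratic             # remaining part beyond delta
    return 0.5 * quadratic ** 2 + delta * linear

print(smooth_l1(np.array([0.3, 2.0])))   # [0.045 1.5]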
Code Example #26
File: conv.py Project: zhuyawen/akg
def conv(data,
         fmap_shape,
         filter_shape,
         pad,
         stride,
         dilation,
         use_bias=False,
         attrs=None,
         params=None):
    """
    Computes sums of 5-D convolutions.

    Args:
        data (list[tvm.tensor.Tensor]): the size is 3 if use_bias else the size is 2;
              data[0] Tensor of type float16, shape 5D (fN, fC // C0, C0, fH, fW)
              data[1] Tensor of type float16, shape 4D (wC // C0 * wH * wW, wN // C0, C0, C0)
              data[2] Tensor of type float16, shape 5D (1, wN // C0, 1, 1, 16)
        fmap_shape (list[int]): [fN, fC, fH, fW]
        filter_shape (list[int]): [wN, wC, wH, wW]
        pad (list[int]): [pad_top, pad_bottom, pad_left, pad_right]
        stride (list[int]): [stride_h, stride_w]
        dilation (list[int]): [dilation_h, dilation_w]
        use_bias (bool): bool var.
        attrs (dict): dict with keys such as conv_tile and bypass.

    Returns:
        tvm.tensor.Tensor of same type as data, shape is 5D(oN, oC // C0, oH, oW, C0)
    """
    c_value = conv_core(data, fmap_shape, filter_shape, pad, stride, dilation,
                        use_bias, attrs)
    c_value = cast.cast(c_value, "float16")

    if use_bias:
        bias_value = data[2]
        output_bias_name = "output1"
        cube = akg.tvm.compute(c_value.shape,
                               lambda n, c1, h, w, c0: c_value[n, c1, h, w, c0]
                               + bias_value[0, c1, 0, 0, c0],
                               name=output_bias_name)
    else:
        cube = c_value

    block_size = 16
    dim_info, _, _, dynamic_ci_c1 = conv_set_dim_func(fmap_shape, filter_shape,
                                                      pad, stride, dilation,
                                                      use_bias, block_size,
                                                      attrs, conv_set_dim_map)

    all_dynamic = 0  # kh kw pad stride
    partial_dynamic = 0  # fn fc1 fh fw wN wC
    dynamic_tiling_full_dynamic = 1  # kh, kw, pad, stride are parameters if dynamic_tiling is enabled

    if attrs is None:
        attrs = {}
    if attrs.get("dynamic"):
        all_dynamic = 1
    if attrs.get("partial_dynamic"):
        partial_dynamic = 1
    dynamic = partial_dynamic or all_dynamic
    dynamic_tiling = 1 if attrs.get("dynamic") else 0

    if not dynamic:
        attrs = {
            "dim": dim_info,
            "pragma_reschedule": 1,
            "pragma_rmselfdep": 0
        }
    else:
        attrs = {
            "dim": dim_info,
            "pragma_reschedule": 1,
            "pragma_rmselfdep": 0,
            "enable_fix_loop_extent": 0,
            "enable_post_poly_loop_partition": 0,
            "enable_isolate_loop": 0,
            "enable_isolate_min_max": 1,
            "enable_conv_analyze_align": 0,
            "enable_double_buffer": 1,
            "enable_multicore": 1,
            "enable_invariant_hoist": 1,
            "pragma_keep_outer_band_order": 1,
            "enable_algebra_simplify": 1,
            "dynamic_shape_conv_full_parametric":
                dynamic_tiling and dynamic_tiling_full_dynamic,
        }
        attrs["pragma_outerband_need_split"] = 1
        attrs["pragma_is_conv"] = 1
        if dynamic_tiling:
            attrs["dynamic_shape"] = set_poly_upper_bound_for_tensor(
                data[0], 129, 1)  # pos 1 of data[0] is CI1 axis
        else:
            attrs["dynamic_shape"] = set_poly_upper_bound_for_tensor(
                data[0], dynamic_ci_c1 + 1, 1)  # pos 1 of data[0] is CI1 axis
        if dynamic_tiling:
            attrs["pragma_tilesize_is_var"] = 1
            attrs["enable_stride_kernel_op"] = 0

    return cube, attrs
Code Example #27
File: conv_bn1.py Project: zhuyawen/akg
def conv_bn1(data,
             fmap_shape,
             filter_shape,
             pad,
             stride,
             dilation,
             use_bias=False,
             attrs=None):
    """
    Computes sums of 5-D convolutions and uses the convolution's fp32 result to compute the first part of Fused_batch_norm.

    Fused_batch_norm's first part:

    .. math::
        m = N \\times H \\times W \\\\
        \\mu_{tmp} = \\sum_{n,h,w}{\\frac{x}{m}} \\\\
        \\sigma^2_{tmp} = \\sum_{n,h,w}{\\frac{x^2}{m}}

    Args:
        data (list[tvm.tensor.Tensor]): the size is 3 if use_bias else the size is 2;
              data[0] Tensor of type float16, shape 5D (fN, fC // C0, C0, fH, fW)
              data[1] Tensor of type float16, shape 4D (wC // C0 * wH * wW, wN // C0, C0, C0)
              data[2] Tensor of type float16, shape 5D (1, wN // C0, 1, 1, 16)
        fmap_shape (list[int]): [fN, fC, fH, fW]
        filter_shape (list[int]): [wN, wC, wH, wW]
        pad (list[int]): [pad_top, pad_bottom, pad_left, pad_right]
        stride (list[int]): [stride_h, stride_w]
        dilation (list[int]): [dilation_h, dilation_w]
        use_bias (bool): bool var.
        attrs (dict): dict with keys such as conv_tile and bypass.

    Returns:
        tvm.tensor.Tensor of same type as data, shape is 5D(oN, oC // C0, oH, oW, C0)
    """

    if use_bias:
        raise ValueError("do not support bias yet !!!")

    block_size = 16
    dim_info, conv_tile, bypass, _ = conv_set_dim_func(
        fmap_shape, filter_shape, pad, stride, dilation, use_bias, block_size,
        attrs, conv_bn1_set_dim_map)
    if attrs is None:
        attrs = {"conv_tile": conv_tile, "bypass": bypass}
    else:
        attrs['conv_tile'] = conv_tile
        attrs['bypass'] = bypass

    conv_res_32 = conv_core(data, fmap_shape, filter_shape, pad, stride,
                            dilation, use_bias, attrs)

    conv_res_16 = cast.cast(conv_res_32, "float16")

    axes = [3, 2, 0]
    conv_res_32_shape = [x.value for x in conv_res_32.shape]
    num = reduce(lambda i, j: i * j, [conv_res_32_shape[i] for i in axes])
    avg_num = round(float(1) / float(num), 12)

    res_sum = akg.topi.sum(conv_res_32, axes, keepdims=True)
    mean = akg.lang.cce.vmuls(res_sum, avg_num)

    res_square = akg.tvm.compute(conv_res_32.shape,
                                 lambda *i: conv_res_32[i] * conv_res_32[i],
                                 name="res_square")
    square_sum = akg.topi.sum(res_square, axes, keepdims=True)
    var_part = akg.lang.cce.vmuls(square_sum, avg_num)

    # need pragma_force_rmselfdep to enable multicore using atomic add
    # because default pragma_rmselfdep=1 will disable multicore of reduce axes
    attrs = {
        "dim": dim_info,
        "pragma_reschedule": 1,
        "enable_bisect_optimize": 0,
        "pragma_rmselfdep": 0,
        "pragma_force_rmselfdep": 1
    }

    return conv_res_16, var_part, mean, attrs
Code Example #28
def conv_backprop_input_compute(data,
                                output_shape,
                                filter_shape,
                                input_shape,
                                pad_,
                                stride_,
                                block_size=16,
                                attrs=None,
                                key=None):
    """core computation of conv_backprop_input."""
    _, in_c, w_h, w_w = filter_shape

    # stride (stride_h, stride_w)
    stride_h, stride_w = stride_
    if stride_h != stride_w:
        raise ValueError("stride_h must be equal to stride_w.")

    # output shape (NCHW -> NC1HWC0)
    in_nn, in_cc, in_hh, in_ww = output_shape
    if in_c % block_size != 0:
        raise ValueError("in_c must be divided by block_size.")
    input_shape_nc1hwc0 = (in_nn, in_cc // block_size, in_hh, in_ww,
                           block_size)
    in_nn, _, in_hh, in_ww, _ = input_shape_nc1hwc0
    input_trans_shape_nc1hwc0 = (in_nn, in_cc // block_size, in_hh * stride_h,
                                 in_ww * stride_w, block_size)
    in_n, in_c1, in_h, in_w, _ = input_trans_shape_nc1hwc0

    # kernel shape (NCHW -> NC1HWC0 -> Fractal)
    k_n, k_c, k_h, k_w = filter_shape
    if k_c % block_size != 0:
        raise ValueError("k_c must be divided by block_size.")
    kernel_shape_nc1hwc0 = (k_n, k_c // block_size, k_h, k_w, block_size)
    k_n, k_c1, k_h, k_w, k_c0 = kernel_shape_nc1hwc0
    kernel_shape_trans = (k_n // block_size * k_h * k_w, k_c // block_size,
                          block_size, block_size)
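    # For backprop-to-input the roles of the kernel's N and C axes are swapped:
    # the transposed convolution produces k_c output channels from k_n input channels.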
    k_c1 = k_n // block_size
    k_n = k_c

    _, _, input_h, input_w = input_shape

    # padding (pad_top, pad_bottom, pad_left, pad_right)
    padding = (pad_[0], pad_[1], pad_[2], pad_[3])
    pad_t, pad_b, pad_l, pad_r = padding
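
    # Backprop-to-input is evaluated as a stride-1 convolution over the
    # zero-dilated gradient; the forward paddings are therefore transformed
    # below so that the final output recovers the original input spatial
    # size (out_h == input_h, out_w == input_w).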

    # padHT -> padHT'
    p_top = k_h - pad_t - 1
    # padHB -> padHB'
    p_bottom = input_h + pad_t - stride_h * (
        (input_h + pad_t + pad_b - k_h) // stride_h + 1)
    # padWL -> padWL'
    p_left = k_w - pad_l - 1
    # padWR -> padWR'
    p_right = input_w + pad_l - stride_w * (
        (input_w + pad_l + pad_r - k_w) // stride_w + 1)

    s_h = 1
    s_w = 1

    # NC1HWC0
    a_value = data[0]

    if data[1].dtype == 'float32':
        b_value = cast.cast(data[1], 'float16')
        tiling_args = cast_tiling_args
    else:
        b_value = data[1]
        tiling_args = conv_backprop_input_tiling_args

    # Create reduction variables
    kc1 = akg.tvm.reduce_axis((0, k_c1), name='kc1')
    kh = akg.tvm.reduce_axis((0, k_h), name='kh')
    kw = akg.tvm.reduce_axis((0, k_w), name='kw')
    kc0 = akg.tvm.reduce_axis((0, k_c0), name='kc0')
    use_auto_tiling = False
    if attrs is not None and 'conv_tile' in attrs and len(
            attrs['conv_tile']) >= 5:
        tile_value = attrs['conv_tile']
    elif key in tiling_args:
        tile_value = tiling_args[key]
    else:
        use_auto_tiling = True

    out_h = (in_h + p_top + p_bottom - k_h) // s_h + 1
    out_w = (in_w + p_left + p_right - k_w) // s_w + 1
    out_shape_nc1hwc0 = (in_n, k_n // block_size, out_h, out_w, block_size)
    out_n, out_c1, out_h, out_w, out_c0 = out_shape_nc1hwc0

    # set dim
    info = dim.Dim()
    index_ = 0

    if not use_auto_tiling:
        tile_hh = tile_value[0]
        if tile_hh == input_h:
            tile_hh += pad_t + pad_b

        tile_coco = tile_value[1]
        tile_coco = (tile_coco + block_size - 1) // block_size * block_size

        tile_mm = tile_value[2]
        tile_mm = (tile_mm + block_size - 1) // block_size * block_size

        tile_kk = tile_value[3]
        if tile_kk % (block_size * w_h * w_w) != 0:
            logging.warning(
                "tile_k should be a multiple of (block_size * w_h * w_w); rounding it up."
            )
        tile_kk = (tile_kk + block_size * w_h * w_w -
                   1) // (block_size * w_h * w_w) * (block_size * w_h * w_w)

        tile_nn = tile_value[4]
        tile_nn = (tile_nn + block_size - 1) // block_size * block_size

        tile_ww = input_w
        if len(tile_value) >= 6 and tile_value[5] > 0:
            tile_ww = tile_value[5]
        if tile_ww == input_w:
            tile_ww += pad_l + pad_r

        if tile_hh == in_h:
            tile_hh += p_top + p_bottom
        tile_out_h = (tile_hh - k_h) // s_h + 1

        if tile_ww == in_w:
            tile_ww += p_left + p_right
        tile_out_w = (tile_ww - k_w) // s_w + 1

        if tile_coco > 0:
            c1_cut = tile_coco // block_size
        else:
            c1_cut = out_c1

        if out_n > 1:
            info.setdim(index=index_, axis=0, tilel1=1, tilel0=0)  # n
        if out_c1 > 1:
            info.setdim(index=index_, axis=1, tilel1=c1_cut, tilel0=0)  # c1
        if out_h > 1:
            info.setdim(index=index_, axis="H", tilel1=tile_out_h,
                        tilel0=0)  # h
        if out_w > 1:
            info.setdim(index=index_, axis="W", tilel1=tile_out_w,
                        tilel0=0)  # w
        if out_c0 > 1:
            info.setdim(index=index_, axis=4, tilel1=out_c0, tilel0=0)  # c0
        if in_c1 > 1:
            info.setdim(index=index_, axis=5, tilel1=in_c1, tilel0=0)  # kc1
        if k_h > 1:
            info.setdim(index=index_, axis=5, tilel1=k_h, tilel0=0)  # kh
        if k_w > 1:
            info.setdim(index=index_, axis=5, tilel1=k_w, tilel0=0)  # kw

        info = str(info)
    else:
        info = ""
    # Compute the convolution below

    output_name = "output0"

    # weight_trans [ ko, no, ni, ki ]
    # weight_trans [ co_1, kh, kw, ci_1, ci_0, co_0 ]
    # kw = ko % k_w
    # kh = ko // k_w % k_h
    # co_1 = ko // k_w // k_h
    # ci_1 = no
    # -->
    # weight [ ci_1, kh', kw', co_1, co_0, ci_0 ]
    # weight [ no, k_h - ko // k_w % k_h - 1, k_w - ko % k_w - 1, ko // k_w // k_h, co_0, ci_0 ]
    b_trans = akg.tvm.compute(kernel_shape_trans,
                              lambda ko, no, ni, ki: b_value[
                                  ((no * k_h + k_h - 1 - ko // k_w % k_h) * k_w
                                   + k_w - 1 - ko % k_w), ko //
                                  (k_h * k_w), ki, ni],
                              name='B_trans')

    if ((stride_h > 1) or (stride_w > 1)):

        @akg.tvm.hybrid.script
        def data_trans_hybrid(output, inputs, const_zero):
            """Implements data_trans ( B[n, c1, h * strideH, w * strideW, c0] = A[n, c1, h, w, c0] )."""

            stride_h = output.shape[2] // inputs.shape[2]
            stride_w = output.shape[3] // inputs.shape[3]

            b = allocate(output.shape, output.dtype, 'local')
            for n in range(output.shape[0]):
                for c1 in range(output.shape[1]):
                    for h in range(output.shape[2]):
                        for w in range(output.shape[3]):
                            for c0 in range(output.shape[4]):
                                b[n, c1, h, w, c0] = const_zero
                                if h % stride_h == 0 and w % stride_w == 0:
                                    b[n, c1, h, w,
                                      c0] = inputs[n, c1, h // stride_h,
                                                   w // stride_w, c0]

            return b

        a_trans_init = akg.tvm.placeholder(input_trans_shape_nc1hwc0,
                                           dtype="float16",
                                           name='a_trans')
        const_zero = akg.tvm.const(0, 'float16')
        a_trans = data_trans_hybrid(a_trans_init, a_value, const_zero)
    else:
        a_trans = a_value
    conv_attrs = {
        "pragma_conv_kernel_n": k_n,
        "pragma_conv_kernel_h": k_h,
        "pragma_conv_kernel_w": k_w,
        "pragma_conv_padding_top": p_top,
        "pragma_conv_padding_bottom": p_bottom,
        "pragma_conv_padding_left": p_left,
        "pragma_conv_padding_right": p_right,
        "pragma_conv_bypass_l1": 0,
        "pragma_conv_backprop_input": 1,
        "pragma_conv_stride_h": s_h,
        "pragma_conv_stride_w": s_w,
        "pragma_conv_dilation_h": 1,
        "pragma_conv_dilation_w": 1,
        "pragma_conv_fm_n": in_n,
        "pragma_conv_fm_c": in_c,
        "pragma_conv_fm_h": in_h,
        "pragma_conv_fm_w": in_w,
        "feature": a_trans.op.name,
        "filter": b_value.op.name,
        "bias": 'None',
        "res": output_name
    }
    if not use_auto_tiling:
        conv_attrs["pragma_conv_h_cut"] = (tile_out_h - 1) * s_h + k_h
        conv_attrs["pragma_conv_w_cut"] = (tile_out_w - 1) * s_w + k_w
        conv_attrs["pragma_conv_co_cut"] = c1_cut * k_c0
        conv_attrs["pragma_conv_m_cut"] = tile_mm
        conv_attrs["pragma_conv_k_cut"] = tile_kk
        conv_attrs["pragma_conv_n_cut"] = tile_nn
    res_c = akg.tvm.compute(
        out_shape_nc1hwc0,
        lambda n, c1, h, w, c0: akg.lang.cce.mmad((akg.tvm.if_then_else(
            akg.tvm.any((h * s_h + kh) < p_top, (h * s_h + kh) >
                        (in_h + p_top - 1), (w * s_w + kw) < p_left,
                        (w * s_w + kw) >
                        (in_w + p_left - 1)), akg.tvm.const(0.0, 'float16'),
            a_trans[n, kc1, (h * s_h + kh - p_top),
                    (w * s_w + kw - p_left), kc0]) * b_trans[
                        (kc1 * k_h + kh) * k_w + kw, c1, c0, kc0]).astype(
                            "float32"),
                                                  axis=[kc1, kh, kw, kc0]),
        name=output_name,
        attrs=conv_attrs)

    res_c = cast.cast(res_c, "float16")

    return res_c, {"dim": info, "pragma_reschedule": 1, "pragma_rmselfdep": 0}
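
The padding transform above is what makes the backprop-to-input result match the original feature-map size. The standalone sketch below (plain Python, hypothetical sizes, not part of the akg sources; the helper name backprop_input_height is invented) reproduces the height arithmetic: it dilates the gradient by the stride, applies the transformed paddings padHT' and padHB', and checks that a stride-1 convolution recovers input_h.

def backprop_input_height(input_h, k_h, pad_t, pad_b, stride_h):
    """Recompute the height arithmetic used in conv_backprop_input_compute."""
    # forward output height = height of the incoming gradient
    grad_h = (input_h + pad_t + pad_b - k_h) // stride_h + 1
    # gradient dilated by the stride (zeros inserted between rows)
    in_h = grad_h * stride_h
    # transformed paddings (padHT', padHB')
    p_top = k_h - pad_t - 1
    p_bottom = input_h + pad_t - stride_h * grad_h
    # stride-1 convolution over the dilated, re-padded gradient
    return (in_h + p_top + p_bottom - k_h) // 1 + 1

# hypothetical sizes: 3x3 kernel, padding 1, stride 2 and stride 1
assert backprop_input_height(224, 3, 1, 1, 2) == 224
assert backprop_input_height(56, 3, 1, 1, 1) == 56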